Kemudraj

AIspit

Sep 10th, 2018
1,332
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 214.68 KB | None | 0 0
  1. #lab1 drva
  2.  
  3. """
  4.  
  5. Задача 1 Problem 1 (2 / 12)
  6. Да се промени класата за дрво на одлука да чува и информација на кое ниво во дрвото се наоѓа јазолот.
  7. Потоа да се променат и функциите за градење и печатење на дрвото така што за секој јазол ќе се печати
  8. и нивото. Коренот е на нулто ниво. На излез со функцијата printTree треба да се испечати даденото
  9. тренинг множество. Прочитана инстанца од стандарден влез да се додаде на тренинг множеството
  10. и потоа да се истренира и испечати истото.
  11.  
  12. """
  13.  
# Training set: [referrer, location, read_FAQ, pages_visited, service_chosen]
# The last column is the class label to be predicted.
trainingData=[['slashdot','USA','yes',18,'None'],
        ['google','France','yes',23,'Premium'],
        ['google','France','yes',23,'Basic'],
        ['google','France','yes',23,'Basic'],
        ['digg','USA','yes',24,'Basic'],
        ['kiwitobes','France','yes',23,'Basic'],
        ['google','UK','no',21,'Premium'],
        ['(direct)','New Zealand','no',12,'None'],
        ['(direct)','UK','no',21,'Basic'],
        ['google','USA','no',24,'Premium'],
        ['slashdot','France','yes',19,'None'],
        ['digg','USA','no',18,'None'],
        ['google','UK','no',18,'None'],
        ['kiwitobes','UK','no',19,'None'],
        ['digg','New Zealand','yes',12,'Basic'],
        ['slashdot','UK','no',21,'None'],
        ['google','UK','yes',18,'Basic'],
        ['kiwitobes','France','yes',19,'Basic']]
  32.  
  33. class decisionnode:
  34.     def __init__(self, col=-1, value=None, results=None, tb=None, fb=None, l=None):
  35.         self.col = col
  36.         self.value = value
  37.         self.results = results
  38.         self.tb = tb
  39.         self.fb = fb
  40.         self.l = l
  41.  
  42.  
  43. def sporedi_broj(row, column, value):
  44.     return row[column] >= value
  45.  
  46.  
  47. def sporedi_string(row, column, value):
  48.     return row[column] == value
  49.  
  50.  
  51. # Divides a set on a specific column. Can handle numeric
  52. # or nominal values
  53. def divideset(rows, column, value):
  54.     # Make a function that tells us if a row is in
  55.     # the first group (true) or the second group (false)
  56.     split_function = None
  57.     if isinstance(value, int) or isinstance(value, float):  # ako vrednosta so koja sporeduvame e od tip int ili float
  58.         # split_function=lambda row:row[column]>=value # togas vrati funkcija cij argument e row i vrakja vrednost true ili false
  59.         split_function = sporedi_broj
  60.     else:
  61.         # split_function=lambda row:row[column]==value # ako vrednosta so koja sporeduvame e od drug tip (string)
  62.         split_function = sporedi_string
  63.  
  64.     # Divide the rows into two sets and return them
  65.     set_false = []
  66.     set_true = []
  67.     for row in rows:
  68.         if split_function(row, column, value):
  69.             set_true.append(row)
  70.         else:
  71.             set_false.append(row)
  72.     set1 = [row for row in rows if
  73.             split_function(row, column, value)]  # za sekoj row od rows za koj split_function vrakja true
  74.     set2 = [row for row in rows if
  75.             not split_function(row, column, value)]  # za sekoj row od rows za koj split_function vrakja false
  76.     # return (set1, set2)
  77.     return (set_true, set_false)
  78.  
  79.  
  80. # Create counts of possible results (the last column of
  81. # each row is the result)
  82. def uniquecounts(rows):
  83.     results = {}
  84.     for row in rows:
  85.         # The result is the last column
  86.         r = row[-1]
  87.         results.setdefault(r, 0)
  88.         results[r] += 1
  89.  
  90.     return results
  91.  
  92.  
  93. def log2(x):
  94.     from math import log
  95.     l2 = log(x) / log(2)
  96.     return l2
  97.  
  98.  
  99. def entropy(rows):
  100.     results = uniquecounts(rows)
  101.     # Now calculate the entropy
  102.     ent = 0.0
  103.     for r in results.keys():
  104.         p = float(results[r]) / len(rows)
  105.         ent = ent - p * log2(p)
  106.     return ent
  107.  
  108.  
def buildtree(rows, l=-1, scoref=entropy):
    """Recursively grow a decision tree over `rows`.

    `l` tracks the depth of the node being built; the default -1 plus the
    increment before recursing makes the root level 0, as the exercise
    requires.  Leaf nodes store class counts and keep the default level.
    """
    if len(rows) == 0: return decisionnode()
    current_score = scoref(rows)

    # Track the best split found so far.
    best_gain = 0.0
    best_column = -1  # NOTE(review): never updated below; appears unused
    best_criteria = None  # (column, value) of the best split
    best_sets = None      # (true_rows, false_rows) of the best split


    column_count = len(rows[0]) - 1  # last column is the class label
    for col in range(column_count):
        # Collect the distinct values appearing in this column.
        column_values = {}
        for row in rows:
            column_values[row[col]] = 1
        # Try a split on every distinct value of this column.
        for value in column_values:
            (set1, set2) = divideset(rows, col, value)

            # Information gain of this candidate split.
            p = float(len(set1)) / len(rows)
            gain = current_score - p * scoref(set1) - (1 - p) * scoref(set2)
            if gain > best_gain and len(set1) > 0 and len(set2) > 0:
                best_gain = gain
                best_criteria = (col, value)
                best_sets = (set1, set2)

    # Create the subbranches at the next depth level.
    if best_gain > 0:
        l = l+1
        trueBranch = buildtree(best_sets[0],l, scoref)
        falseBranch = buildtree(best_sets[1],l, scoref)
        return decisionnode(col=best_criteria[0], value=best_criteria[1],
                            tb=trueBranch, fb=falseBranch,l=l)
    else:
        # Leaf: remember the class counts; level stays the default None.
        return decisionnode(results=uniquecounts(rows))
  156.  
  157.  
  158.  
def printtree(tree, indent=''):
    """Print the tree; inner nodes show 'col:value?  Level=l' (Python 2)."""
    # Is this a leaf node?
    if tree.results != None:
        print str(tree.results)
    else:
        # Print the split criterion together with the node's depth level.
        print(str(tree.col) + ':' + str(tree.value) + '? ') + 'Level=' + str(tree.l)
        # Print the branches; the trailing comma suppresses the newline (Py2).
        print(indent + 'T->'),
        printtree(tree.tb, indent + '  ')
        print(indent + 'F->'),
        printtree(tree.fb, indent + '  ')
  171.  
  172.  
  173. def classify(observation, tree):
  174.     if tree.results != None:
  175.         return tree.results
  176.     else:
  177.         vrednost = observation[tree.col]
  178.         branch = None
  179.  
  180.         if isinstance(vrednost, int) or isinstance(vrednost, float):
  181.             if vrednost >= tree.value:
  182.                 branch = tree.tb
  183.             else:
  184.                 branch = tree.fb
  185.         else:
  186.             if vrednost == tree.value:
  187.                 branch = tree.tb
  188.             else:
  189.                 branch = tree.fb
  190.  
  191.         return classify(observation, branch)
  192.  
  193.  
  194.  
if __name__ == "__main__":
    # Read one instance from stdin.  NOTE(review): this is Python 2 code,
    # where input() eval's the line — strings must be typed quoted;
    # raw_input() is the usual choice here.  Confirm intended interpreter.
    referrer=input()
    location=input()
    readFAQ=input()
    pagesVisited=input()
    serviceChosen=input()


    # Append the new instance, retrain, then print the whole tree.
    tmp = [referrer,location,readFAQ,pagesVisited,serviceChosen]
    trainingData.append(tmp)
    t = buildtree(trainingData)

    printtree(t)
  214. -------------------------------------------------------------------------------------------------------
  215. #lab2 Drva
  216. """
  217. Да се промени функцијата за предвидување, така што таа ќе ја печати само класата
  218. која ја предвидува (а не речник како сега). Притоа да се проверува дали во листот
  219. има повеќе од една класа. Ако има само една класа тогаш се предвидува истата, но
  220. ако има повеќе од една треба да се испечати таа со најголем број на инстанци. Ако
  221. во листот има неколку класи со ист број на инстанци да се предвиде првата класа по азбучен ред.
  222. """
  223.  
# Training set: [referrer, location, read_FAQ, pages_visited, service_chosen]
# The last column is the class label to be predicted.
trainingData=[['slashdot','USA','yes',18,'None'],
        ['google','France','yes',23,'Premium'],
        ['google','France','yes',23,'Basic'],
        ['google','France','yes',23,'Basic'],
        ['digg','USA','yes',24,'Basic'],
        ['kiwitobes','France','yes',23,'Basic'],
        ['google','UK','no',21,'Premium'],
        ['(direct)','New Zealand','no',12,'None'],
        ['(direct)','UK','no',21,'Basic'],
        ['google','USA','no',24,'Premium'],
        ['slashdot','France','yes',19,'None'],
        ['digg','USA','no',18,'None'],
        ['google','UK','no',18,'None'],
        ['kiwitobes','UK','no',19,'None'],
        ['digg','New Zealand','yes',12,'Basic'],
        ['slashdot','UK','no',21,'None'],
        ['google','UK','yes',18,'Basic'],
        ['kiwitobes','France','yes',19,'Basic']]
  242.  
  243. class decisionnode:
  244.     #lel init konstruktor?
  245.       def __init__(self,col=-1,value=None,results=None,tb=None,fb=None):
  246.          self.col=col
  247.          self.value=value
  248.          self.results=results
  249.          self.tb=tb
  250.          self.fb=fb
  251.  
  252. def sporedi_broj(row,column,value):
  253.   return row[column]>=value
  254.  
  255. def sporedi_string(row,column,value):
  256.   return row[column]==value
  257.  
  258. # Divides a set on a specific column. Can handle numeric
  259. # or nominal values
  260. def divideset(rows,column,value):
  261.     # Make a function that tells us if a row is in
  262.     # the first group (true) or the second group (false)
  263.     split_function=None #Flag za proverka
  264.     if isinstance(value,int) or isinstance(value,float):
  265.        # ako vrednosta so koja sporeduvame e od tip int ili float
  266.        split_function=sporedi_broj
  267.     else:
  268.        # ako vrednosta so koja sporeduvame e od drug tip (string)
  269.        split_function=sporedi_string
  270.  
  271.     # Divide the rows into two sets and return them
  272.     set1=[row for row in rows if split_function(row,column,value)]  # za sekoj row od rows za koj split_function vrakja true
  273.     set2=[row for row in rows if not split_function(row,column,value)] # za sekoj row od rows za koj split_function vrakja false
  274.     return (set1,set2)
  275.  
  276. # Create counts of possible results
  277. #(the last column (vertical result) of each row is the result)
  278.  
  279. def uniquecounts(rows):
  280.   results={}
  281.   for row in rows:
  282.      # The result is the last column
  283.      r=row[len(row)-1]
  284.      if r not in results:
  285.             results[r]=0
  286.      results[r]+=1
  287.   return results
  288.  
  289. # Entropy is the sum of p(x)log(p(x)) across all
  290. # the different possible results
  291. def entropy(rows):
  292.       from math import log
  293.       log2=lambda x:log(x)/log(2)
  294.       results=uniquecounts(rows)
  295.       # Now calculate the entropy
  296.       ent=0.0
  297.       for r in results.keys():
  298.             p=float(results[r])/len(rows)
  299.             ent=ent-p*log2(p)
  300.       return ent
  301.  
def buildtree(rows,scoref=entropy):
      """Recursively train a decision tree on `rows` using `scoref`.

      NOTE(review): scoref is not forwarded to the recursive calls, so a
      non-default scoring function only applies at the root — confirm.
      """
      if len(rows)==0: return decisionnode()
      current_score=scoref(rows)
     
      # Track the best split found so far.
      best_gain=0.0
      best_criteria=None
      best_sets=None

      column_count=len(rows[0])-1  # last column is the class label
      for col in range(0,column_count):
            # Collect the distinct values appearing in this column.
            column_values={}
            for row in rows:
                  column_values[row[col]]=1
                 
            # Try splitting the rows on every distinct value of this
            # column.
            for value in column_values.keys():
                  (set1,set2)=divideset(rows,col,value)

                  # Information gain of this candidate split.
                  p=float(len(set1))/len(rows)
                  gain=current_score-p*scoref(set1)-(1-p)*scoref(set2)
                  if gain>best_gain and len(set1)>0 and len(set2)>0:
                        best_gain=gain
                        best_criteria=(col,value)
                        best_sets=(set1,set2)

      # Create the subbranches.
      if best_gain>0:
            trueBranch=buildtree(best_sets[0])
            falseBranch=buildtree(best_sets[1])
            return decisionnode(col=best_criteria[0],value=best_criteria[1],tb=trueBranch, fb=falseBranch)
      else:
            # Leaf node: remember the class counts for prediction.
            return decisionnode(results=uniquecounts(rows))
  339.  
  340.            
  341. def classify(observation,tree):
  342.     if tree.results!=None:
  343.         recnik=tree.results;
  344.         lista=[]
  345.         for k in recnik.keys():
  346.             torka=(k,recnik[k])
  347.             lista.append(torka)
  348.         brKlasi=len(torka)
  349.         if brKlasi==1:
  350.             return lista[0][0]
  351.         lista.sort()
  352.         return lista[0][0]
  353.     else:
  354.         vrednost=observation[tree.col]
  355.         branch=None
  356.  
  357.         if isinstance(vrednost,int) or isinstance(vrednost,float):
  358.             if vrednost>=tree.value: branch=tree.tb
  359.             else: branch=tree.fb
  360.         else:
  361.            if vrednost==tree.value: branch=tree.tb
  362.            else: branch=tree.fb
  363.  
  364.     return classify(observation,branch)
  365.  
if __name__ == "__main__":
    # Read the test instance from stdin.  NOTE(review): Python 2 input()
    # eval's the line, so strings must be entered quoted — confirm the
    # intended interpreter / input format.
    referrer=input()
    location=input()
    readFAQ=input()
    pagesVisited=input()
    serviceChosen=input()

    # Build the tree and print only the predicted class (Python 2 print).
    testCase=[referrer, location, readFAQ, pagesVisited, serviceChosen]
    t=buildtree(trainingData)
    print classify(testCase,t)
  382. -------------------------------------------------------------------------------------------------------
  383. #januari 2017
  384. """
  385. Да се промени алгоритмот за дрво на одлука така што ќе се изградат 2 дрва на одлука.
  386. Едното дрво на одлука ќе ја користи првата половина од податочното множество, а другото дрво,
  387. втората половина.
  388. Доколку двете дрва на одлука на тест примерот го дадат истиот резултат, да се испечати тој резултат.
  389. Доколку дадат различен резултат, да се испечати KONTRADIKCIJA.
  390. Доколку некое од дрвата има само една класа тогаш се предвидува истата,
  391. но ако има повеќе од една треба да се избере таа со најголем број на инстанци.
  392. Ако во листот има неколку класи со ист број на инстанци да се предвиде првата класа по азбучен ред.
  393. """
# Iris data set: [sepal_length, sepal_width, petal_length, petal_width, class]
# The last column is the species label to be predicted.
trainingData=[
[6.3,2.9,5.6,1.8,'I. virginica'],
[6.5,3.0,5.8,2.2,'I. virginica'],
[7.6,3.0,6.6,2.1,'I. virginica'],
[4.9,2.5,4.5,1.7,'I. virginica'],
[7.3,2.9,6.3,1.8,'I. virginica'],
[6.7,2.5,5.8,1.8,'I. virginica'],
[7.2,3.6,6.1,2.5,'I. virginica'],
[6.5,3.2,5.1,2.0,'I. virginica'],
[6.4,2.7,5.3,1.9,'I. virginica'],
[6.8,3.0,5.5,2.1,'I. virginica'],
[5.7,2.5,5.0,2.0,'I. virginica'],
[5.8,2.8,5.1,2.4,'I. virginica'],
[6.4,3.2,5.3,2.3,'I. virginica'],
[6.5,3.0,5.5,1.8,'I. virginica'],
[7.7,3.8,6.7,2.2,'I. virginica'],
[7.7,2.6,6.9,2.3,'I. virginica'],
[6.0,2.2,5.0,1.5,'I. virginica'],
[6.9,3.2,5.7,2.3,'I. virginica'],
[5.6,2.8,4.9,2.0,'I. virginica'],
[7.7,2.8,6.7,2.0,'I. virginica'],
[6.3,2.7,4.9,1.8,'I. virginica'],
[6.7,3.3,5.7,2.1,'I. virginica'],
[7.2,3.2,6.0,1.8,'I. virginica'],
[6.2,2.8,4.8,1.8,'I. virginica'],
[6.1,3.0,4.9,1.8,'I. virginica'],
[6.4,2.8,5.6,2.1,'I. virginica'],
[7.2,3.0,5.8,1.6,'I. virginica'],
[7.4,2.8,6.1,1.9,'I. virginica'],
[7.9,3.8,6.4,2.0,'I. virginica'],
[6.4,2.8,5.6,2.2,'I. virginica'],
[6.3,2.8,5.1,1.5,'I. virginica'],
[6.1,2.6,5.6,1.4,'I. virginica'],
[7.7,3.0,6.1,2.3,'I. virginica'],
[6.3,3.4,5.6,2.4,'I. virginica'],
[5.1,3.5,1.4,0.2,'I. setosa'],
[4.9,3.0,1.4,0.2,'I. setosa'],
[4.7,3.2,1.3,0.2,'I. setosa'],
[4.6,3.1,1.5,0.2,'I. setosa'],
[5.0,3.6,1.4,0.2,'I. setosa'],
[5.4,3.9,1.7,0.4,'I. setosa'],
[4.6,3.4,1.4,0.3,'I. setosa'],
[5.0,3.4,1.5,0.2,'I. setosa'],
[4.4,2.9,1.4,0.2,'I. setosa'],
[4.9,3.1,1.5,0.1,'I. setosa'],
[5.4,3.7,1.5,0.2,'I. setosa'],
[4.8,3.4,1.6,0.2,'I. setosa'],
[4.8,3.0,1.4,0.1,'I. setosa'],
[4.3,3.0,1.1,0.1,'I. setosa'],
[5.8,4.0,1.2,0.2,'I. setosa'],
[5.7,4.4,1.5,0.4,'I. setosa'],
[5.4,3.9,1.3,0.4,'I. setosa'],
[5.1,3.5,1.4,0.3,'I. setosa'],
[5.7,3.8,1.7,0.3,'I. setosa'],
[5.1,3.8,1.5,0.3,'I. setosa'],
[5.4,3.4,1.7,0.2,'I. setosa'],
[5.1,3.7,1.5,0.4,'I. setosa'],
[4.6,3.6,1.0,0.2,'I. setosa'],
[5.1,3.3,1.7,0.5,'I. setosa'],
[4.8,3.4,1.9,0.2,'I. setosa'],
[5.0,3.0,1.6,0.2,'I. setosa'],
[5.0,3.4,1.6,0.4,'I. setosa'],
[5.2,3.5,1.5,0.2,'I. setosa'],
[5.2,3.4,1.4,0.2,'I. setosa'],
[5.5,2.3,4.0,1.3,'I. versicolor'],
[6.5,2.8,4.6,1.5,'I. versicolor'],
[5.7,2.8,4.5,1.3,'I. versicolor'],
[6.3,3.3,4.7,1.6,'I. versicolor'],
[4.9,2.4,3.3,1.0,'I. versicolor'],
[6.6,2.9,4.6,1.3,'I. versicolor'],
[5.2,2.7,3.9,1.4,'I. versicolor'],
[5.0,2.0,3.5,1.0,'I. versicolor'],
[5.9,3.0,4.2,1.5,'I. versicolor'],
[6.0,2.2,4.0,1.0,'I. versicolor'],
[6.1,2.9,4.7,1.4,'I. versicolor'],
[5.6,2.9,3.6,1.3,'I. versicolor'],
[6.7,3.1,4.4,1.4,'I. versicolor'],
[5.6,3.0,4.5,1.5,'I. versicolor'],
[5.8,2.7,4.1,1.0,'I. versicolor'],
[6.2,2.2,4.5,1.5,'I. versicolor'],
[5.6,2.5,3.9,1.1,'I. versicolor'],
[5.9,3.2,4.8,1.8,'I. versicolor'],
[6.1,2.8,4.0,1.3,'I. versicolor'],
[6.3,2.5,4.9,1.5,'I. versicolor'],
[6.1,2.8,4.7,1.2,'I. versicolor'],
[6.4,2.9,4.3,1.3,'I. versicolor'],
[6.6,3.0,4.4,1.4,'I. versicolor'],
[6.8,2.8,4.8,1.4,'I. versicolor'],
[6.7,3.0,5.0,1.7,'I. versicolor'],
[6.0,2.9,4.5,1.5,'I. versicolor'],
[5.7,2.6,3.5,1.0,'I. versicolor'],
[5.5,2.4,3.8,1.1,'I. versicolor'],
[5.5,2.4,3.7,1.0,'I. versicolor'],
[5.8,2.7,3.9,1.2,'I. versicolor'],
[6.0,2.7,5.1,1.6,'I. versicolor'],
[5.4,3.0,4.5,1.5,'I. versicolor'],
[6.0,3.4,4.5,1.6,'I. versicolor'],
[6.7,3.1,4.7,1.5,'I. versicolor'],
[6.3,2.3,4.4,1.3,'I. versicolor'],
[5.6,3.0,4.1,1.3,'I. versicolor'],
[5.5,2.5,4.0,1.3,'I. versicolor'],
[5.5,2.6,4.4,1.2,'I. versicolor'],
[6.1,3.0,4.6,1.4,'I. versicolor'],
[5.8,2.6,4.0,1.2,'I. versicolor'],
[5.0,2.3,3.3,1.0,'I. versicolor'],
[5.6,2.7,4.2,1.3,'I. versicolor'],
[5.7,3.0,4.2,1.2,'I. versicolor'],
[5.7,2.9,4.2,1.3,'I. versicolor'],
[6.2,2.9,4.3,1.3,'I. versicolor'],
[5.1,2.5,3.0,1.1,'I. versicolor'],
[5.7,2.8,4.1,1.3,'I. versicolor'],
[6.4,3.1,5.5,1.8,'I. virginica'],
[6.0,3.0,4.8,1.8,'I. virginica'],
[6.9,3.1,5.4,2.1,'I. virginica'],
[6.7,3.1,5.6,2.4,'I. virginica'],
[6.9,3.1,5.1,2.3,'I. virginica'],
[5.8,2.7,5.1,1.9,'I. virginica'],
[6.8,3.2,5.9,2.3,'I. virginica'],
[6.7,3.3,5.7,2.5,'I. virginica'],
[6.7,3.0,5.2,2.3,'I. virginica'],
[6.3,2.5,5.0,1.9,'I. virginica'],
[6.5,3.0,5.2,2.0,'I. virginica'],
[6.2,3.4,5.4,2.3,'I. virginica'],
[4.7,3.2,1.6,0.2,'I. setosa'],
[4.8,3.1,1.6,0.2,'I. setosa'],
[5.4,3.4,1.5,0.4,'I. setosa'],
[5.2,4.1,1.5,0.1,'I. setosa'],
[5.5,4.2,1.4,0.2,'I. setosa'],
[4.9,3.1,1.5,0.2,'I. setosa'],
[5.0,3.2,1.2,0.2,'I. setosa'],
[5.5,3.5,1.3,0.2,'I. setosa'],
[4.9,3.6,1.4,0.1,'I. setosa'],
[4.4,3.0,1.3,0.2,'I. setosa'],
[5.1,3.4,1.5,0.2,'I. setosa'],
[5.0,3.5,1.3,0.3,'I. setosa'],
[4.5,2.3,1.3,0.3,'I. setosa'],
[4.4,3.2,1.3,0.2,'I. setosa'],
[5.0,3.5,1.6,0.6,'I. setosa'],
[5.1,3.8,1.9,0.4,'I. setosa'],
[4.8,3.0,1.4,0.3,'I. setosa'],
[5.1,3.8,1.6,0.2,'I. setosa'],
[5.9,3.0,5.1,1.8,'I. virginica']
]
  537.  
  538. class decisionnode:
  539.     def __init__(self, col=-1, value=None, results=None, tb=None, fb=None):
  540.         self.col = col
  541.         self.value = value
  542.         self.results = results
  543.         self.tb = tb
  544.         self.fb = fb
  545.  
  546.  
  547. def sporedi_broj(row, column, value):
  548.     return row[column] >= value
  549.  
  550.  
  551. def sporedi_string(row, column, value):
  552.     return row[column] == value
  553.  
  554.  
  555. # Divides a set on a specific column. Can handle numeric
  556. # or nominal values
  557. def divideset(rows, column, value):
  558.     # Make a function that tells us if a row is in
  559.     # the first group (true) or the second group (false)
  560.     split_function = None
  561.     if isinstance(value, int) or isinstance(value, float):  # ako vrednosta so koja sporeduvame e od tip int ili float
  562.         # split_function=lambda row:row[column]>=value # togas vrati funkcija cij argument e row i vrakja vrednost true ili false
  563.         split_function = sporedi_broj
  564.     else:
  565.         # split_function=lambda row:row[column]==value # ako vrednosta so koja sporeduvame e od drug tip (string)
  566.         split_function = sporedi_string
  567.  
  568.     # Divide the rows into two sets and return them
  569.     set_false = []
  570.     set_true = []
  571.     for row in rows:
  572.         if split_function(row, column, value):
  573.             set_true.append(row)
  574.         else:
  575.             set_false.append(row)
  576.     set1 = [row for row in rows if
  577.             split_function(row, column, value)]  # za sekoj row od rows za koj split_function vrakja true
  578.     set2 = [row for row in rows if
  579.             not split_function(row, column, value)]  # za sekoj row od rows za koj split_function vrakja false
  580.     # return (set1, set2)
  581.     return (set_true, set_false)
  582.  
  583. def uniquecounts(rows):
  584.     results = {}
  585.     for row in rows:
  586.         # The result is the last column
  587.         r = row[-1]
  588.         results.setdefault(r, 0)
  589.         results[r] += 1
  590.  
  591.     return results
  592.  
  593. def log2(x):
  594.     from math import log
  595.     l2 = log(x) / log(2)
  596.     return l2
  597.  
  598. def entropy(rows):
  599.     results = uniquecounts(rows)
  600.     # Now calculate the entropy
  601.     ent = 0.0
  602.     for r in results.keys():
  603.         p = float(results[r]) / len(rows)
  604.         ent = ent - p * log2(p)
  605.     return ent
def buildtree(rows, scoref=entropy):
    """Recursively grow a decision tree over `rows` using `scoref`."""
    if len(rows) == 0: return decisionnode()
    current_score = scoref(rows)

    # Track the best split found so far.
    best_gain = 0.0
    best_column = -1
    best_value = None
    best_subsetf = None  # rows failing the best test
    best_subsett = None  # rows passing the best test

    column_count = len(rows[0]) - 1  # last column is the class label
    for col in range(column_count):
        # Collect the distinct values appearing in this column.
        column_values = set()
        for row in rows:
            column_values.add(row[col])
        # Try a split on every distinct value of this column.
        for value in column_values:
            (set1, set2) = divideset(rows, col, value)

            # Information gain of this candidate split.
            p = float(len(set1)) / len(rows)
            gain = current_score - p * scoref(set1) - (1 - p) * scoref(set2)
            if gain > best_gain and len(set1) > 0 and len(set2) > 0:
                best_gain = gain
                best_column = col
                best_value = value
                best_subsett = set1
                best_subsetf = set2

    # Create the subbranches.
    if best_gain > 0:
        trueBranch = buildtree(best_subsett, scoref)
        falseBranch = buildtree(best_subsetf, scoref)
        return decisionnode(col=best_column, value=best_value,
                            tb=trueBranch, fb=falseBranch)
    else:
        # Leaf node: remember the class counts for prediction.
        return decisionnode(results=uniquecounts(rows))
  649.    
  650. def classify(observation, tree):
  651.         if tree.results != None:
  652.             return tree.results
  653.         else:
  654.             vrednost = observation[tree.col]
  655.             branch = None
  656.  
  657.             if isinstance(vrednost, int) or isinstance(vrednost, float):
  658.                 if vrednost >= tree.value:
  659.                     branch = tree.tb
  660.                 else:
  661.                     branch = tree.fb
  662.             else:
  663.                 if vrednost == tree.value:
  664.                     branch = tree.tb
  665.                 else:
  666.                     branch = tree.fb
  667.  
  668.             return classify(observation, branch)
if __name__ == "__main__":
   

    # Read a test instance from stdin (Python 2 input() eval's each line).
    att1=input()
    att2=input()
    att3=input()
    att4=input()
    planttype=input()
    testCase=[att1,att2,att3,att4,planttype]
    mn1=[] # first training set
    mn2=[] # second training set
    vkupno=len(trainingData) # total number of rows
    for i in range(0,vkupno/2): # fill the first training set (Py2 integer division)
        mn1.append(trainingData[i])
    for i in range(vkupno/2,vkupno): # fill the second training set
        mn2.append(trainingData[i])
    drvo1=buildtree(mn1) # tree from the first half
    drvo2=buildtree(mn2) # tree from the second half
    kl1=classify(testCase,drvo1) # first prediction (a counts dict)
    kl2=classify(testCase,drvo2) # second prediction
    # NOTE(review): dict.keys()[0] is Python-2-only and picks an arbitrary
    # key rather than the majority class the task asks for; the two
    # contradiction branches also print a class before 'KONTRADIKCIJA',
    # while the task asks for KONTRADIKCIJA alone — confirm intent.
    if (kl1.keys()[0]==kl2.keys()[0]): # both trees agree?
        print kl1.keys()[0]
    if(kl1.values()[0]>kl2.values()[0]):
        print kl1.keys()[0]
        print 'KONTRADIKCIJA'
    if(kl2.values()[0]>kl1.values()[0]):
        print kl1.keys()[0]
        print 'KONTRADIKCIJA'
  697. -------------------------------------------------------------------------------------------------------
#januari 2018 - Drva
"""
Да се промени функцијата за предвидување, така што при изминувањето ќе печати информации за:
-со која колона и вредност се споредува
-за која е тековната вредност на тест примерокот за бараната колона
-нивото на тековниот јазол во дрвото
-која е следната гранка што ќе се изминува низ дрвото (True branch или False branch)
-преостанатиот дел од дрвото што треба да се измине
-празна линија

Потоа да се испечати истренираното дрво, да се вчита непознат тренинг примерок од стандардниот влез
и истиот да се класифицира со новата функција за предвидување.
"""
  709.  
# Training set: [referrer, location, read_FAQ, pages_visited, service_chosen]
# The last column is the class label to be predicted.
trainingData=[['twitter','USA','yes',18,'None'],
        ['google','France','yes',23,'Premium'],
        ['google','France','no',26,'Basic'],
        ['google','Macedonia','yes',13,'None'],
        ['pinterest','USA','yes',24,'Basic'],
        ['bing','France','yes',23,'Basic'],
        ['google','UK','no',21,'Premium'],
        ['facebook','New Zealand','no',12,'None'],
        ['facebook','UK','no',21,'Basic'],
        ['google','USA','no',24,'Premium'],
        ['twitter','France','yes',19,'None'],
        ['pinterest','USA','no',18,'None'],
        ['google','UK','no',18,'None'],
        ['bing','UK','yes',19,'Premium'],
        ['bing','Macedonia','no',10,'None'],
        ['facebook','Macedonia','no',16,'Basic'],
        ['bing','UK','no',19,'Basic'],
        ['pinterest','Germany','no',2,'None'],
        ['pinterest','USA','yes',12,'Basic'],
        ['twitter','UK','no',21,'None'],
        ['twitter','UK','yes',26,'Premium'],
        ['google','UK','yes',18,'Basic'],
        ['bing','France','yes',19,'Basic']]

# Unlabeled instances to classify (service is 'Unknown').
test_cases=[['google','MK','no',24,'Unknown'],
            ['google','MK','no',15,'Unknown'],
            ['pinterest','UK','yes',21,'Unknown'],
            ['pinterest','UK','no',25,'Unknown']]
  738.  
  739. # trainingData=[line.split('\t') for line in file('decision_tree_example.txt')]
  740.  
  741. class decisionnode:
  742.       def __init__(self,col=-1,value=None,results=None,tb=None,fb=None,level=0):
  743.          self.level=level
  744.          self.col=col
  745.          self.value=value
  746.          self.results=results
  747.          self.tb=tb
  748.          self.fb=fb
  749.          self.level=level
  750.  
  751. def sporedi_broj(row,column,value):
  752.   return row[column]>=value
  753.  
  754. def sporedi_string(row,column,value):
  755.   return row[column]==value
  756.  
  757. # Divides a set on a specific column. Can handle numeric
  758. # or nominal values
  759. def divideset(rows,column,value):
  760.     # Make a function that tells us if a row is in
  761.     # the first group (true) or the second group (false)
  762.     split_function=None
  763.     if isinstance(value,int) or isinstance(value,float): # ako vrednosta so koja sporeduvame e od tip int ili float
  764.        #split_function=lambda row:row[column]>=value # togas vrati funkcija cij argument e row i vrakja vrednost true ili false
  765.        split_function=sporedi_broj
  766.     else:
  767.        # split_function=lambda row:row[column]==value # ako vrednosta so koja sporeduvame e od drug tip (string)
  768.        split_function=sporedi_string
  769.  
  770.     # Divide the rows into two sets and return them
  771.     # set1=[row for row in rows if split_function(row)]  # za sekoj row od rows za koj split_function vrakja true
  772.     # set2=[row for row in rows if not split_function(row)] # za sekoj row od rows za koj split_function vrakja false
  773.     set1=[row for row in rows if split_function(row,column,value)]  # za sekoj row od rows za koj split_function vrakja true
  774.     set2=[row for row in rows if not split_function(row,column,value)] # za sekoj row od rows za koj split_function vrakja false
  775.    
  776.     return (set1,set2)
  777.  
  778. # Divides a set on a specific column. Can handle numeric
  779. # or nominal values
  780. def divideset2(rows,column,value):
  781.     # Make a function that tells us if a row is in
  782.     # the first group (true) or the second group (false)
  783.     split_function=None
  784.     if isinstance(value,int) or isinstance(value,float): # ako vrednosta so koja sporeduvame e od tip int ili float
  785.        #split_function=lambda row:row[column]>=value # togas vrati funkcija cij argument e row i vrakja vrednost true ili false
  786.        split_function=sporedi_broj
  787.     else:
  788.        # split_function=lambda row:row[column]==value # ako vrednosta so koja sporeduvame e od drug tip (string)
  789.        split_function=sporedi_string
  790.  
  791.     # Divide the rows into two sets and return them
  792.     # set1=[row for row in rows if split_function(row)]  # za sekoj row od rows za koj split_function vrakja true
  793.     # set2=[row for row in rows if not split_function(row)] # za sekoj row od rows za koj split_function vrakja false
  794.     set1=[]
  795.     set2=[]
  796.     for row in rows:
  797.       if split_function(row,column,value):
  798.         set1.append(row)
  799.       else:
  800.         set2.append(row)
  801.     return (set1,set2)
  802.  
  803.  
  804.  
  805. # Create counts of possible results (the last column of
  806. # each row is the result)
  807. def uniquecounts(rows):
  808.   results={}
  809.   for row in rows:
  810.      # The result is the last column
  811.      r=row[len(row)-1]
  812.      if r not in results: results[r]=0
  813.      results[r]+=1
  814.   return results
  815.  
  816.  
  817. # Entropy is the sum of p(x)log(p(x)) across all
  818. # the different possible results
  819. def entropy(rows):
  820.       from math import log
  821.       log2=lambda x:log(x)/log(2)
  822.       results=uniquecounts(rows)
  823.       # Now calculate the entropy
  824.       ent=0.0
  825.       for r in results.keys():
  826.             p=float(results[r])/len(rows)
  827.             ent=ent-p*log2(p)
  828.       return ent
  829.  
  830. def buildtree(rows,scoref=entropy,level=0):
  831.       if len(rows)==0: return decisionnode()
  832.       current_score=scoref(rows)
  833.  
  834.       # Set up some variables to track the best criteria
  835.       best_gain=0.0
  836.       best_criteria=None
  837.       best_sets=None
  838.      
  839.       column_count=len(rows[0])-1
  840.       for col in range(0,column_count):
  841.             # Generate the list of different values in
  842.             # this column
  843.             column_values={}
  844.             for row in rows:
  845.                   column_values[row[col]]=1
  846.                   # print row[col]
  847.             # print
  848.             # print column_values
  849.             # Now try dividing the rows up for each value
  850.             # in this column
  851.             for value in column_values.keys():
  852.                   (set1,set2)=divideset(rows,col,value)
  853.                  
  854.                   # Information gain
  855.                   p=float(len(set1))/len(rows)
  856.                   gain=current_score-p*scoref(set1)-(1-p)*scoref(set2)
  857.                   # print set1, set2, gain
  858.                   if gain>best_gain and len(set1)>0 and len(set2)>0:
  859.                         best_gain=gain
  860.                         best_criteria=(col,value)
  861.                         best_sets=(set1,set2)
  862.      
  863.       # Create the subbranches
  864.       if best_gain>0:
  865.             trueBranch=buildtree(best_sets[0],level=level+1)
  866.             falseBranch=buildtree(best_sets[1], level=level+1)
  867.             return decisionnode(col=best_criteria[0],value=best_criteria[1],
  868.                             tb=trueBranch, fb=falseBranch, level=level)
  869.       else:
  870.             return decisionnode(results=uniquecounts(rows))
  871.  
def printtree(tree,indent=''):
      """Recursively print the tree, one branch per line.

      Interior nodes are shown as 'col:value?' together with their level.
      NOTE(review): leaf nodes are printed without indentation and without
      their level; Python 2 print statements -- runs only under Python 2.
      """
      # Is this a leaf node?
      if tree.results!=None:
            print str(tree.results)
      else:
            # Print the split criterion and the node's depth.
            print str(tree.col)+':'+str(tree.value)+'?' + ' Level='+str(tree.level)
            # Print the branches; the trailing comma keeps the recursive
            # output on the same line as the T->/F-> marker.
            print indent+'T->',
            printtree(tree.tb,indent+'  ')
            print indent+'F->',
            printtree(tree.fb,indent+'  ')
  884.  
  885. def classify(observation,tree):
  886.     if tree.results!=None:
  887.         results=[(value,key) for key,value in tree.results.items()]
  888.         results.sort()
  889.         return results[0][1]
  890.     else:
  891.         vrednost=observation[tree.col]
  892.         branch=None
  893.  
  894.         if isinstance(vrednost,int) or isinstance(vrednost,float):
  895.             if vrednost>=tree.value: branch=tree.tb
  896.             else: branch=tree.fb
  897.         else:
  898.            if vrednost==tree.value: branch=tree.tb
  899.            else: branch=tree.fb
  900.  
  901.         return classify(observation,branch)
  902.        
  903.  
  904. def classify2(observation,tree):
  905.     if tree.results!=None:
  906.         results=[(value,key) for key,value in tree.results.items()]
  907.         results.sort()
  908.         return results[0][1]
  909.     else:
  910.         vrednost=observation[tree.col]
  911.         branch=None
  912.  
  913.         if isinstance(vrednost,int) or isinstance(vrednost,float):
  914.             if vrednost>=tree.value: branch=tree.tb
  915.             else: branch=tree.fb
  916.         else:
  917.            if vrednost==tree.value: branch=tree.tb
  918.            else: branch=tree.fb
  919.  
  920.         return classify2(observation,branch)
  921.  
def classify3(observation,tree):
    """Classify `observation` while printing a trace of every decision step.

    NOTE(review): Python 2 print statements -- runs only under Python 2.
    At a leaf the label with the *smallest* count is returned (ties broken
    by label order).
    """
    if tree.results!=None:
        # Leaf: rank (count, label) pairs ascending, return the first label.
        results=[(value,key) for key,value in tree.results.items()]
        results.sort()
        return results[0][1]
    else:
        vrednost=observation[tree.col]
        branch=None
        granka='True branch'
        # Numeric attributes route with >=, everything else with equality.
        if isinstance(vrednost,int) or isinstance(vrednost,float):
            if vrednost>=tree.value:
                branch=tree.tb
            else:
                branch=tree.fb
                granka='False branch'
        else:
           if vrednost==tree.value:
               branch=tree.tb
           else:
               branch=tree.fb
               granka='False branch'
        # Trace output (Macedonian): the column/value compared, the observed
        # value, the branch taken, and the remaining subtree to traverse.
        print 'Sporeduvam kolona i vrednost', (tree.col, tree.value)
        print 'Tekovna vrednost:', vrednost
        print 'Sledna granka:',granka
        print 'Preostanata granka za izminuvanje:'
        printtree(branch)
        print
        return classify3(observation,branch)
  950.  
if __name__ == "__main__":
    # Read one query instance from stdin.
    # NOTE(review): Python 2 input() evaluates the typed text, so string
    # answers must be entered quoted; pagesVisited is expected numeric.
    referrer=input()
    location=input()
    readFAQ=input()
    pagesVisited=input()
    serviceChosen='Unknown'  # the class of the query instance is unknown


    testCase=[referrer, location, readFAQ, pagesVisited, serviceChosen]

    # Train on the module-level trainingData, print the tree, then print
    # the prediction with a step-by-step trace.
    t=buildtree(trainingData)
    printtree(t)
    print classify3(testCase,t)
  964.  
  965. --------------------------------------------------------------------------------------------------------
  966. # -*- coding: utf-8 -*-
  967.  
  968.  
  969. #Дадено е податочно множество од риби кое ги содржи следните видови:
  970.  
  971. #    Code Finnish  Swedish    English        Latin
  972.  
  973.    #  1   Lahna    Braxen     Bream          Abramis brama
  974.  #    2   Siika    Iiden      Whitewish      Leusiscus idus
  975.    #  3   Saerki   Moerten    Roach          Leuciscus rutilus
  976.  #    4   Parkki   Bjoerknan  ?              Abramis bjrkna
  977.   #   5   Norssi   Norssen    Smelt          Osmerus eperlanus
  978.    #  6   Hauki    Jaedda     Pike           Esox lucius
  979.    #  7   Ahven    Abborre    Perch          Perca fluviatilis
  980. #Дадени се следните атрибути:
  981.  
  982.   #  0  Weight      Weight of the fish (in grams)
  983.   #  1  Length1     Length from the nose to the beginning of the tail (in cm)
  984.   #  2  Length2     Length from the nose to the notch of the tail (in cm)
  985.   #  3  Length3     Length from the nose to the end of the tail (in cm)
  986.    # 4  Height%     Maximal height as % of Length3
  987.    # 5  Width%      Maximal width as % of Length3
  988. #Класата е дадена во последната колона.
  989.  
  990. #Да се направи модел за класификација за даденото податочно множество. За тренинг да се земат
  991. #само првите 5 примероци од секоја од класите во множеството. Притоа ова да се направи во програмата,
  992. # а не со рачно копирање! Да се класифицира елементот од податочното множество даден на влез и да се
  993. # испечати предвидувањето. Елементот е даден со индексот од податочното множество.
  994.  
  995.  
  996.  
  997. data = [[242.0, 23.2, 25.4, 30.0, 38.4, 13.4, 1],
  998.         [290.0, 24.0, 26.3, 31.2, 40.0, 13.8, 1],
  999.         [340.0, 23.9, 26.5, 31.1, 39.8, 15.1, 1],
  1000.         [363.0, 26.3, 29.0, 33.5, 38.0, 13.3, 1],
  1001.         [430.0, 26.5, 29.0, 34.0, 36.6, 15.1, 1],
  1002.         [450.0, 26.8, 29.7, 34.7, 39.2, 14.2, 1],
  1003.         [500.0, 26.8, 29.7, 34.5, 41.1, 15.3, 1],
  1004.         [390.0, 27.6, 30.0, 35.0, 36.2, 13.4, 1],
  1005.         [450.0, 27.6, 30.0, 35.1, 39.9, 13.8, 1],
  1006.         [500.0, 28.5, 30.7, 36.2, 39.3, 13.7, 1],
  1007.         [475.0, 28.4, 31.0, 36.2, 39.4, 14.1, 1],
  1008.         [500.0, 28.7, 31.0, 36.2, 39.7, 13.3, 1],
  1009.         [500.0, 29.1, 31.5, 36.4, 37.8, 12.0, 1],
  1010.         [500.0, 29.5, 32.0, 37.3, 37.3, 13.6, 1],
  1011.         [600.0, 29.4, 32.0, 37.2, 40.2, 13.9, 1],
  1012.         [600.0, 29.4, 32.0, 37.2, 41.5, 15.0, 1],
  1013.         [700.0, 30.4, 33.0, 38.3, 38.8, 13.8, 1],
  1014.         [700.0, 30.4, 33.0, 38.5, 38.8, 13.5, 1],
  1015.         [610.0, 30.9, 33.5, 38.6, 40.5, 13.3, 1],
  1016.         [650.0, 31.0, 33.5, 38.7, 37.4, 14.8, 1],
  1017.         [575.0, 31.3, 34.0, 39.5, 38.3, 14.1, 1],
  1018.         [685.0, 31.4, 34.0, 39.2, 40.8, 13.7, 1],
  1019.         [620.0, 31.5, 34.5, 39.7, 39.1, 13.3, 1],
  1020.         [680.0, 31.8, 35.0, 40.6, 38.1, 15.1, 1],
  1021.         [700.0, 31.9, 35.0, 40.5, 40.1, 13.8, 1],
  1022.         [725.0, 31.8, 35.0, 40.9, 40.0, 14.8, 1],
  1023.         [720.0, 32.0, 35.0, 40.6, 40.3, 15.0, 1],
  1024.         [714.0, 32.7, 36.0, 41.5, 39.8, 14.1, 1],
  1025.         [850.0, 32.8, 36.0, 41.6, 40.6, 14.9, 1],
  1026.         [1000.0, 33.5, 37.0, 42.6, 44.5, 15.5, 1],
  1027.         [920.0, 35.0, 38.5, 44.1, 40.9, 14.3, 1],
  1028.         [955.0, 35.0, 38.5, 44.0, 41.1, 14.3, 1],
  1029.         [925.0, 36.2, 39.5, 45.3, 41.4, 14.9, 1],
  1030.         [975.0, 37.4, 41.0, 45.9, 40.6, 14.7, 1],
  1031.         [950.0, 38.0, 41.0, 46.5, 37.9, 13.7, 1],
  1032.         [270.0, 23.6, 26.0, 28.7, 29.2, 14.8, 2],
  1033.         [270.0, 24.1, 26.5, 29.3, 27.8, 14.5, 2],
  1034.         [306.0, 25.6, 28.0, 30.8, 28.5, 15.2, 2],
  1035.         [540.0, 28.5, 31.0, 34.0, 31.6, 19.3, 2],
  1036.         [800.0, 33.7, 36.4, 39.6, 29.7, 16.6, 2],
  1037.         [1000.0, 37.3, 40.0, 43.5, 28.4, 15.0, 2],
  1038.         [40.0, 12.9, 14.1, 16.2, 25.6, 14.0, 3],
  1039.         [69.0, 16.5, 18.2, 20.3, 26.1, 13.9, 3],
  1040.         [78.0, 17.5, 18.8, 21.2, 26.3, 13.7, 3],
  1041.         [87.0, 18.2, 19.8, 22.2, 25.3, 14.3, 3],
  1042.         [120.0, 18.6, 20.0, 22.2, 28.0, 16.1, 3],
  1043.         [0.0, 19.0, 20.5, 22.8, 28.4, 14.7, 3],
  1044.         [110.0, 19.1, 20.8, 23.1, 26.7, 14.7, 3],
  1045.         [120.0, 19.4, 21.0, 23.7, 25.8, 13.9, 3],
  1046.         [150.0, 20.4, 22.0, 24.7, 23.5, 15.2, 3],
  1047.         [145.0, 20.5, 22.0, 24.3, 27.3, 14.6, 3],
  1048.         [160.0, 20.5, 22.5, 25.3, 27.8, 15.1, 3],
  1049.         [140.0, 21.0, 22.5, 25.0, 26.2, 13.3, 3],
  1050.         [160.0, 21.1, 22.5, 25.0, 25.6, 15.2, 3],
  1051.         [169.0, 22.0, 24.0, 27.2, 27.7, 14.1, 3],
  1052.         [161.0, 22.0, 23.4, 26.7, 25.9, 13.6, 3],
  1053.         [200.0, 22.1, 23.5, 26.8, 27.6, 15.4, 3],
  1054.         [180.0, 23.6, 25.2, 27.9, 25.4, 14.0, 3],
  1055.         [290.0, 24.0, 26.0, 29.2, 30.4, 15.4, 3],
  1056.         [272.0, 25.0, 27.0, 30.6, 28.0, 15.6, 3],
  1057.         [390.0, 29.5, 31.7, 35.0, 27.1, 15.3, 3],
  1058.         [55.0, 13.5, 14.7, 16.5, 41.5, 14.1, 4],
  1059.         [60.0, 14.3, 15.5, 17.4, 37.8, 13.3, 4],
  1060.         [90.0, 16.3, 17.7, 19.8, 37.4, 13.5, 4],
  1061.         [120.0, 17.5, 19.0, 21.3, 39.4, 13.7, 4],
  1062.         [150.0, 18.4, 20.0, 22.4, 39.7, 14.7, 4],
  1063.         [140.0, 19.0, 20.7, 23.2, 36.8, 14.2, 4],
  1064.         [170.0, 19.0, 20.7, 23.2, 40.5, 14.7, 4],
  1065.         [145.0, 19.8, 21.5, 24.1, 40.4, 13.1, 4],
  1066.         [200.0, 21.2, 23.0, 25.8, 40.1, 14.2, 4],
  1067.         [273.0, 23.0, 25.0, 28.0, 39.6, 14.8, 4],
  1068.         [300.0, 24.0, 26.0, 29.0, 39.2, 14.6, 4],
  1069.         [6.7, 9.3, 9.8, 10.8, 16.1, 9.7, 5],
  1070.         [7.5, 10.0, 10.5, 11.6, 17.0, 10.0, 5],
  1071.         [7.0, 10.1, 10.6, 11.6, 14.9, 9.9, 5],
  1072.         [9.7, 10.4, 11.0, 12.0, 18.3, 11.5, 5],
  1073.         [9.8, 10.7, 11.2, 12.4, 16.8, 10.3, 5],
  1074.         [8.7, 10.8, 11.3, 12.6, 15.7, 10.2, 5],
  1075.         [10.0, 11.3, 11.8, 13.1, 16.9, 9.8, 5],
  1076.         [9.9, 11.3, 11.8, 13.1, 16.9, 8.9, 5],
  1077.         [9.8, 11.4, 12.0, 13.2, 16.7, 8.7, 5],
  1078.         [12.2, 11.5, 12.2, 13.4, 15.6, 10.4, 5],
  1079.         [13.4, 11.7, 12.4, 13.5, 18.0, 9.4, 5],
  1080.         [12.2, 12.1, 13.0, 13.8, 16.5, 9.1, 5],
  1081.         [19.7, 13.2, 14.3, 15.2, 18.9, 13.6, 5],
  1082.         [19.9, 13.8, 15.0, 16.2, 18.1, 11.6, 5],
  1083.         [200.0, 30.0, 32.3, 34.8, 16.0, 9.7, 6],
  1084.         [300.0, 31.7, 34.0, 37.8, 15.1, 11.0, 6],
  1085.         [300.0, 32.7, 35.0, 38.8, 15.3, 11.3, 6],
  1086.         [300.0, 34.8, 37.3, 39.8, 15.8, 10.1, 6],
  1087.         [430.0, 35.5, 38.0, 40.5, 18.0, 11.3, 6],
  1088.         [345.0, 36.0, 38.5, 41.0, 15.6, 9.7, 6],
  1089.         [456.0, 40.0, 42.5, 45.5, 16.0, 9.5, 6],
  1090.         [510.0, 40.0, 42.5, 45.5, 15.0, 9.8, 6],
  1091.         [540.0, 40.1, 43.0, 45.8, 17.0, 11.2, 6],
  1092.         [500.0, 42.0, 45.0, 48.0, 14.5, 10.2, 6],
  1093.         [567.0, 43.2, 46.0, 48.7, 16.0, 10.0, 6],
  1094.         [770.0, 44.8, 48.0, 51.2, 15.0, 10.5, 6],
  1095.         [950.0, 48.3, 51.7, 55.1, 16.2, 11.2, 6],
  1096.         [1250.0, 52.0, 56.0, 59.7, 17.9, 11.7, 6],
  1097.         [1600.0, 56.0, 60.0, 64.0, 15.0, 9.6, 6],
  1098.         [1550.0, 56.0, 60.0, 64.0, 15.0, 9.6, 6],
  1099.         [1650.0, 59.0, 63.4, 68.0, 15.9, 11.0, 6],
  1100.         [5.9, 7.5, 8.4, 8.8, 24.0, 16.0, 7],
  1101.         [32.0, 12.5, 13.7, 14.7, 24.0, 13.6, 7],
  1102.         [40.0, 13.8, 15.0, 16.0, 23.9, 15.2, 7],
  1103.         [51.5, 15.0, 16.2, 17.2, 26.7, 15.3, 7],
  1104.         [70.0, 15.7, 17.4, 18.5, 24.8, 15.9, 7],
  1105.         [100.0, 16.2, 18.0, 19.2, 27.2, 17.3, 7],
  1106.         [78.0, 16.8, 18.7, 19.4, 26.8, 16.1, 7],
  1107.         [80.0, 17.2, 19.0, 20.2, 27.9, 15.1, 7],
  1108.         [85.0, 17.8, 19.6, 20.8, 24.7, 14.6, 7],
  1109.         [85.0, 18.2, 20.0, 21.0, 24.2, 13.2, 7],
  1110.         [110.0, 19.0, 21.0, 22.5, 25.3, 15.8, 7],
  1111.         [115.0, 19.0, 21.0, 22.5, 26.3, 14.7, 7],
  1112.         [125.0, 19.0, 21.0, 22.5, 25.3, 16.3, 7],
  1113.         [130.0, 19.3, 21.3, 22.8, 28.0, 15.5, 7],
  1114.         [120.0, 20.0, 22.0, 23.5, 26.0, 14.5, 7],
  1115.         [120.0, 20.0, 22.0, 23.5, 24.0, 15.0, 7],
  1116.         [130.0, 20.0, 22.0, 23.5, 26.0, 15.0, 7],
  1117.         [135.0, 20.0, 22.0, 23.5, 25.0, 15.0, 7],
  1118.         [110.0, 20.0, 22.0, 23.5, 23.5, 17.0, 7],
  1119.         [130.0, 20.5, 22.5, 24.0, 24.4, 15.1, 7],
  1120.         [150.0, 20.5, 22.5, 24.0, 28.3, 15.1, 7],
  1121.         [145.0, 20.7, 22.7, 24.2, 24.6, 15.0, 7],
  1122.         [150.0, 21.0, 23.0, 24.5, 21.3, 14.8, 7],
  1123.         [170.0, 21.5, 23.5, 25.0, 25.1, 14.9, 7],
  1124.         [225.0, 22.0, 24.0, 25.5, 28.6, 14.6, 7],
  1125.         [145.0, 22.0, 24.0, 25.5, 25.0, 15.0, 7],
  1126.         [188.0, 22.6, 24.6, 26.2, 25.7, 15.9, 7],
  1127.         [180.0, 23.0, 25.0, 26.5, 24.3, 13.9, 7],
  1128.         [197.0, 23.5, 25.6, 27.0, 24.3, 15.7, 7],
  1129.         [218.0, 25.0, 26.5, 28.0, 25.6, 14.8, 7],
  1130.         [300.0, 25.2, 27.3, 28.7, 29.0, 17.9, 7],
  1131.         [260.0, 25.4, 27.5, 28.9, 24.8, 15.0, 7],
  1132.         [265.0, 25.4, 27.5, 28.9, 24.4, 15.0, 7],
  1133.         [250.0, 25.4, 27.5, 28.9, 25.2, 15.8, 7],
  1134.         [250.0, 25.9, 28.0, 29.4, 26.6, 14.3, 7],
  1135.         [300.0, 26.9, 28.7, 30.1, 25.2, 15.4, 7],
  1136.         [320.0, 27.8, 30.0, 31.6, 24.1, 15.1, 7],
  1137.         [514.0, 30.5, 32.8, 34.0, 29.5, 17.7, 7],
  1138.         [556.0, 32.0, 34.5, 36.5, 28.1, 17.5, 7],
  1139.         [840.0, 32.5, 35.0, 37.3, 30.8, 20.9, 7],
  1140.         [685.0, 34.0, 36.5, 39.0, 27.9, 17.6, 7],
  1141.         [700.0, 34.0, 36.0, 38.3, 27.7, 17.6, 7],
  1142.         [700.0, 34.5, 37.0, 39.4, 27.5, 15.9, 7],
  1143.         [690.0, 34.6, 37.0, 39.3, 26.9, 16.2, 7],
  1144.         [900.0, 36.5, 39.0, 41.4, 26.9, 18.1, 7],
  1145.         [650.0, 36.5, 39.0, 41.4, 26.9, 14.5, 7],
  1146.         [820.0, 36.6, 39.0, 41.3, 30.1, 17.8, 7],
  1147.         [850.0, 36.9, 40.0, 42.3, 28.2, 16.8, 7],
  1148.         [900.0, 37.0, 40.0, 42.5, 27.6, 17.0, 7],
  1149.         [1015.0, 37.0, 40.0, 42.4, 29.2, 17.6, 7],
  1150.         [820.0, 37.1, 40.0, 42.5, 26.2, 15.6, 7],
  1151.         [1100.0, 39.0, 42.0, 44.6, 28.7, 15.4, 7],
  1152.         [1000.0, 39.8, 43.0, 45.2, 26.4, 16.1, 7],
  1153.         [1100.0, 40.1, 43.0, 45.5, 27.5, 16.3, 7],
  1154.         [1000.0, 40.2, 43.5, 46.0, 27.4, 17.7, 7],
  1155.         [1000.0, 41.1, 44.0, 46.6, 26.8, 16.3, 7]]
  1156.  
  1157.  
  1158.  
  1159. class decisionnode:
  1160.     def __init__(self, col=-1, value=None, results=None, tb=None, fb=None):
  1161.         self.col = col
  1162.         self.value = value
  1163.         self.results = results
  1164.         self.tb = tb
  1165.         self.fb = fb
  1166.  
  1167.  
  1168. def sporedi_broj(row, column, value):
  1169.     return row[column] >= value
  1170.  
  1171.  
  1172. def sporedi_string(row, column, value):
  1173.     return row[column] == value
  1174.  
  1175.  
  1176. # Divides a set on a specific column. Can handle numeric
  1177. # or nominal values
  1178. def divideset(rows, column, value):
  1179.     # Make a function that tells us if a row is in
  1180.     # the first group (true) or the second group (false)
  1181.     split_function = None
  1182.     if isinstance(value, int) or isinstance(value, float):  # ako vrednosta so koja sporeduvame e od tip int ili float
  1183.         # split_function=lambda row:row[column]>=value # togas vrati funkcija cij argument e row i vrakja vrednost true ili false
  1184.         split_function = sporedi_broj
  1185.     else:
  1186.         # split_function=lambda row:row[column]==value # ako vrednosta so koja sporeduvame e od drug tip (string)
  1187.         split_function = sporedi_string
  1188.  
  1189.     # Divide the rows into two sets and return them
  1190.     set_false = []
  1191.     set_true = []
  1192.     for row in rows:
  1193.         if split_function(row, column, value):
  1194.             set_true.append(row)
  1195.         else:
  1196.             set_false.append(row)
  1197.     set1 = [row for row in rows if
  1198.             split_function(row, column, value)]  # za sekoj row od rows za koj split_function vrakja true
  1199.     set2 = [row for row in rows if
  1200.             not split_function(row, column, value)]  # za sekoj row od rows za koj split_function vrakja false
  1201.     # return (set1, set2)
  1202.     return (set_true, set_false)
  1203.  
  1204.  
  1205.  
  1206.  
  1207. # Create counts of possible results (the last column of
  1208. # each row is the result)
  1209. def uniquecounts(rows):
  1210.     results = {}
  1211.     for row in rows:
  1212.         # The result is the last column
  1213.         r = row[-1]
  1214.         results.setdefault(r, 0)
  1215.         results[r] += 1
  1216.  
  1217.     return results
  1218.  
  1219.  
  1220. # Probability that a randomly placed item will
  1221. # be in the wrong category
  1222.  
  1223. def log2(x):
  1224.     from math import log
  1225.     l2 = log(x) / log(2)
  1226.     return l2
  1227.  
  1228.  
  1229. # Entropy is the sum of p(x)log(p(x)) across all
  1230. # the different possible results
  1231. def entropy(rows):
  1232.     results = uniquecounts(rows)
  1233.     # Now calculate the entropy
  1234.     ent = 0.0
  1235.     for r in results.keys():
  1236.         p = float(results[r]) / len(rows)
  1237.         ent = ent - p * log2(p)
  1238.     return ent
  1239.  
  1240.  
  1241. def buildtree(rows, scoref=entropy):
  1242.     if len(rows) == 0: return decisionnode()
  1243.     current_score = scoref(rows)
  1244.  
  1245.     # Set up some variables to track the best criteria
  1246.     best_gain = 0.0
  1247.     best_column = -1
  1248.     best_value = None
  1249.     best_subsetf = None
  1250.     best_subsett = None
  1251.  
  1252.     column_count = len(rows[0]) - 1
  1253.     for col in range(column_count):
  1254.         # Generate the list of different values in
  1255.         # this column
  1256.         column_values = set()
  1257.         for row in rows:
  1258.             column_values.add(row[col])
  1259.         # Now try dividing the rows up for each value
  1260.         # in this column
  1261.         for value in column_values:
  1262.             (set1, set2) = divideset(rows, col, value)
  1263.  
  1264.             # Information gain
  1265.             p = float(len(set1)) / len(rows)
  1266.             gain = current_score - p * scoref(set1) - (1 - p) * scoref(set2)
  1267.             if gain > best_gain and len(set1) > 0 and len(set2) > 0:
  1268.                 best_gain = gain
  1269.                 best_column = col
  1270.                 best_value = value
  1271.                 best_subsett = set1
  1272.                 best_subsetf = set2
  1273.                 # best_criteria = (col, value)
  1274.                 # best_sets = (set1, set2)
  1275.  
  1276.     # Create the subbranches
  1277.     if best_gain > 0:
  1278.         trueBranch = buildtree(best_subsett, scoref)
  1279.         falseBranch = buildtree(best_subsetf, scoref)
  1280.         return decisionnode(col=best_column, value=best_value,
  1281.                             tb=trueBranch, fb=falseBranch)
  1282.     else:
  1283.         return decisionnode(results=uniquecounts(rows))
  1284.  
  1285.  
  1286.  
  1287. def printtree(tree, indent=''):
  1288.     # Is this a leaf node?
  1289.     if tree.results != None:
  1290.         print(indent + str(sorted(tree.results.items())))
  1291.     else:
  1292.         # Print the criteria
  1293.         print(indent + str(tree.col) + ':' + str(tree.value) + '? ')
  1294.         # Print the branches
  1295.         print(indent + 'T->')
  1296.         printtree(tree.tb, indent + '  ')
  1297.         print(indent + 'F->')
  1298.         printtree(tree.fb, indent + '  ')
  1299.  
  1300.  
  1301.  
  1302.  
  1303. def classify(observation, tree):
  1304.     if tree.results != None:
  1305.         return tree.results
  1306.     else:
  1307.         vrednost = observation[tree.col]
  1308.         branch = None
  1309.  
  1310.         if isinstance(vrednost, int) or isinstance(vrednost, float):
  1311.             if vrednost >= tree.value:
  1312.                 branch = tree.tb
  1313.             else:
  1314.                 branch = tree.fb
  1315.         else:
  1316.             if vrednost == tree.value:
  1317.                 branch = tree.tb
  1318.             else:
  1319.                 branch = tree.fb
  1320.  
  1321.         return classify(observation, branch)
  1322.  
  1323.  
  1324.  
  1325. def new_data_set_fu(data):
  1326.     classes = {}
  1327.     for i in data:
  1328.         # print(i)
  1329.         classes.setdefault(i[-1])
  1330.  
  1331.     classes_data = list(classes)
  1332.  
  1333.     flag = 0
  1334.     tmp = 0
  1335.     data_set = []
  1336.     for i in data:
  1337.  
  1338.         if flag == 5:
  1339.             flag = 0
  1340.             tmp += 1
  1341.             if tmp == classes_data[-1]:
  1342.                 break
  1343.         elif classes_data[tmp] == i[-1]:
  1344.             data_set.append(i)
  1345.             # print(i)
  1346.             flag += 1
  1347.  
  1348.     return data_set
  1349.  
if __name__ == '__main__':

    # Index (into the module-level `data`) of the instance to classify.
    my_index = 5

    # Train only on the first 5 examples of each class, as the exercise asks.
    data_set = new_data_set_fu(data)
    t = buildtree(data_set)

    #printtree(t)

    # Classify the chosen instance; classify returns the leaf's class counts.
    c = classify(data[my_index],t)

    print(c)
  1362. ----------------------------------------------------------------------------------------------------
  1363. Drva ispitna
  1364. """
  1365.  
  1366. Дрва на одлука (100 поени) Problem 2 (0 / 16)
  1367. Да се промени алгоритмот за дрво на одлука така што ќе се изградат две дрва на одлука. Секое од дрвата го користи половина од податочното множество.
  1368.  
  1369. Да се промени начинот на печатење на дрвото така што покрај секој јазол, ќе се испечати и неговото ниво.
  1370.  
  1371. Двете дрва да се испечатат и потоа да се испечати резултатот од класификацијата.
  1372.  
  1373. Доколку некое од дрвата има само една класа тогаш се предвидува истата, но ако има повеќе од една треба да се избере таа со најголем број на инстанци. Ако во листот има неколку класи со ист број на инстанци да се предвиде првата класа по азбучен ред.
  1374.  
  1375. Доколку двете дрва ја предвидат истата класа да се испечати класата. Во спротивно да се испечати KONTRADIKCIJA.
  1376.  
  1377.  
  1378.  
  1379. """
  1380.  
  1381. trainingData = [
  1382.     [6.3, 2.9, 5.6, 1.8, 'I. virginica'],
  1383.     [6.5, 3.0, 5.8, 2.2, 'I. virginica'],
  1384.     [7.6, 3.0, 6.6, 2.1, 'I. virginica'],
  1385.     [4.9, 2.5, 4.5, 1.7, 'I. virginica'],
  1386.     [7.3, 2.9, 6.3, 1.8, 'I. virginica'],
  1387.     [6.7, 2.5, 5.8, 1.8, 'I. virginica'],
  1388.     [7.2, 3.6, 6.1, 2.5, 'I. virginica'],
  1389.     [6.5, 3.2, 5.1, 2.0, 'I. virginica'],
  1390.     [6.4, 2.7, 5.3, 1.9, 'I. virginica'],
  1391.     [6.8, 3.0, 5.5, 2.1, 'I. virginica'],
  1392.     [5.7, 2.5, 5.0, 2.0, 'I. virginica'],
  1393.     [5.8, 2.8, 5.1, 2.4, 'I. virginica'],
  1394.     [6.4, 3.2, 5.3, 2.3, 'I. virginica'],
  1395.     [6.5, 3.0, 5.5, 1.8, 'I. virginica'],
  1396.     [7.7, 3.8, 6.7, 2.2, 'I. virginica'],
  1397.     [7.7, 2.6, 6.9, 2.3, 'I. virginica'],
  1398.     [6.0, 2.2, 5.0, 1.5, 'I. virginica'],
  1399.     [6.9, 3.2, 5.7, 2.3, 'I. virginica'],
  1400.     [5.6, 2.8, 4.9, 2.0, 'I. virginica'],
  1401.     [7.7, 2.8, 6.7, 2.0, 'I. virginica'],
  1402.     [6.3, 2.7, 4.9, 1.8, 'I. virginica'],
  1403.     [6.7, 3.3, 5.7, 2.1, 'I. virginica'],
  1404.     [7.2, 3.2, 6.0, 1.8, 'I. virginica'],
  1405.     [6.2, 2.8, 4.8, 1.8, 'I. virginica'],
  1406.     [6.1, 3.0, 4.9, 1.8, 'I. virginica'],
  1407.     [6.4, 2.8, 5.6, 2.1, 'I. virginica'],
  1408.     [7.2, 3.0, 5.8, 1.6, 'I. virginica'],
  1409.     [7.4, 2.8, 6.1, 1.9, 'I. virginica'],
  1410.     [7.9, 3.8, 6.4, 2.0, 'I. virginica'],
  1411.     [6.4, 2.8, 5.6, 2.2, 'I. virginica'],
  1412.     [6.3, 2.8, 5.1, 1.5, 'I. virginica'],
  1413.     [6.1, 2.6, 5.6, 1.4, 'I. virginica'],
  1414.     [7.7, 3.0, 6.1, 2.3, 'I. virginica'],
  1415.     [6.3, 3.4, 5.6, 2.4, 'I. virginica'],
  1416.     [5.1, 3.5, 1.4, 0.2, 'I. setosa'],
  1417.     [4.9, 3.0, 1.4, 0.2, 'I. setosa'],
  1418.     [4.7, 3.2, 1.3, 0.2, 'I. setosa'],
  1419.     [4.6, 3.1, 1.5, 0.2, 'I. setosa'],
  1420.     [5.0, 3.6, 1.4, 0.2, 'I. setosa'],
  1421.     [5.4, 3.9, 1.7, 0.4, 'I. setosa'],
  1422.     [4.6, 3.4, 1.4, 0.3, 'I. setosa'],
  1423.     [5.0, 3.4, 1.5, 0.2, 'I. setosa'],
  1424.     [4.4, 2.9, 1.4, 0.2, 'I. setosa'],
  1425.     [4.9, 3.1, 1.5, 0.1, 'I. setosa'],
  1426.     [5.4, 3.7, 1.5, 0.2, 'I. setosa'],
  1427.     [4.8, 3.4, 1.6, 0.2, 'I. setosa'],
  1428.     [4.8, 3.0, 1.4, 0.1, 'I. setosa'],
  1429.     [4.3, 3.0, 1.1, 0.1, 'I. setosa'],
  1430.     [5.8, 4.0, 1.2, 0.2, 'I. setosa'],
  1431.     [5.7, 4.4, 1.5, 0.4, 'I. setosa'],
  1432.     [5.4, 3.9, 1.3, 0.4, 'I. setosa'],
  1433.     [5.1, 3.5, 1.4, 0.3, 'I. setosa'],
  1434.     [5.7, 3.8, 1.7, 0.3, 'I. setosa'],
  1435.     [5.1, 3.8, 1.5, 0.3, 'I. setosa'],
  1436.     [5.4, 3.4, 1.7, 0.2, 'I. setosa'],
  1437.     [5.1, 3.7, 1.5, 0.4, 'I. setosa'],
  1438.     [4.6, 3.6, 1.0, 0.2, 'I. setosa'],
  1439.     [5.1, 3.3, 1.7, 0.5, 'I. setosa'],
  1440.     [4.8, 3.4, 1.9, 0.2, 'I. setosa'],
  1441.     [5.0, 3.0, 1.6, 0.2, 'I. setosa'],
  1442.     [5.0, 3.4, 1.6, 0.4, 'I. setosa'],
  1443.     [5.2, 3.5, 1.5, 0.2, 'I. setosa'],
  1444.     [5.2, 3.4, 1.4, 0.2, 'I. setosa'],
  1445.     [5.5, 2.3, 4.0, 1.3, 'I. versicolor'],
  1446.     [6.5, 2.8, 4.6, 1.5, 'I. versicolor'],
  1447.     [5.7, 2.8, 4.5, 1.3, 'I. versicolor'],
  1448.     [6.3, 3.3, 4.7, 1.6, 'I. versicolor'],
  1449.     [4.9, 2.4, 3.3, 1.0, 'I. versicolor'],
  1450.     [6.6, 2.9, 4.6, 1.3, 'I. versicolor'],
  1451.     [5.2, 2.7, 3.9, 1.4, 'I. versicolor'],
  1452.     [5.0, 2.0, 3.5, 1.0, 'I. versicolor'],
  1453.     [5.9, 3.0, 4.2, 1.5, 'I. versicolor'],
  1454.     [6.0, 2.2, 4.0, 1.0, 'I. versicolor'],
  1455.     [6.1, 2.9, 4.7, 1.4, 'I. versicolor'],
  1456.     [5.6, 2.9, 3.6, 1.3, 'I. versicolor'],
  1457.     [6.7, 3.1, 4.4, 1.4, 'I. versicolor'],
  1458.     [5.6, 3.0, 4.5, 1.5, 'I. versicolor'],
  1459.     [5.8, 2.7, 4.1, 1.0, 'I. versicolor'],
  1460.     [6.2, 2.2, 4.5, 1.5, 'I. versicolor'],
  1461.     [5.6, 2.5, 3.9, 1.1, 'I. versicolor'],
  1462.     [5.9, 3.2, 4.8, 1.8, 'I. versicolor'],
  1463.     [6.1, 2.8, 4.0, 1.3, 'I. versicolor'],
  1464.     [6.3, 2.5, 4.9, 1.5, 'I. versicolor'],
  1465.     [6.1, 2.8, 4.7, 1.2, 'I. versicolor'],
  1466.     [6.4, 2.9, 4.3, 1.3, 'I. versicolor'],
  1467.     [6.6, 3.0, 4.4, 1.4, 'I. versicolor'],
  1468.     [6.8, 2.8, 4.8, 1.4, 'I. versicolor'],
  1469.     [6.7, 3.0, 5.0, 1.7, 'I. versicolor'],
  1470.     [6.0, 2.9, 4.5, 1.5, 'I. versicolor'],
  1471.     [5.7, 2.6, 3.5, 1.0, 'I. versicolor'],
  1472.     [5.5, 2.4, 3.8, 1.1, 'I. versicolor'],
  1473.     [5.5, 2.4, 3.7, 1.0, 'I. versicolor'],
  1474.     [5.8, 2.7, 3.9, 1.2, 'I. versicolor'],
  1475.     [6.0, 2.7, 5.1, 1.6, 'I. versicolor'],
  1476.     [5.4, 3.0, 4.5, 1.5, 'I. versicolor'],
  1477.     [6.0, 3.4, 4.5, 1.6, 'I. versicolor'],
  1478.     [6.7, 3.1, 4.7, 1.5, 'I. versicolor'],
  1479.     [6.3, 2.3, 4.4, 1.3, 'I. versicolor'],
  1480.     [5.6, 3.0, 4.1, 1.3, 'I. versicolor'],
  1481.     [5.5, 2.5, 4.0, 1.3, 'I. versicolor'],
  1482.     [5.5, 2.6, 4.4, 1.2, 'I. versicolor'],
  1483.     [6.1, 3.0, 4.6, 1.4, 'I. versicolor'],
  1484.     [5.8, 2.6, 4.0, 1.2, 'I. versicolor'],
  1485.     [5.0, 2.3, 3.3, 1.0, 'I. versicolor'],
  1486.     [5.6, 2.7, 4.2, 1.3, 'I. versicolor'],
  1487.     [5.7, 3.0, 4.2, 1.2, 'I. versicolor'],
  1488.     [5.7, 2.9, 4.2, 1.3, 'I. versicolor'],
  1489.     [6.2, 2.9, 4.3, 1.3, 'I. versicolor'],
  1490.     [5.1, 2.5, 3.0, 1.1, 'I. versicolor'],
  1491.     [5.7, 2.8, 4.1, 1.3, 'I. versicolor'],
  1492.     [6.4, 3.1, 5.5, 1.8, 'I. virginica'],
  1493.     [6.0, 3.0, 4.8, 1.8, 'I. virginica'],
  1494.     [6.9, 3.1, 5.4, 2.1, 'I. virginica'],
  1495.     [6.7, 3.1, 5.6, 2.4, 'I. virginica'],
  1496.     [6.9, 3.1, 5.1, 2.3, 'I. virginica'],
  1497.     [5.8, 2.7, 5.1, 1.9, 'I. virginica'],
  1498.     [6.8, 3.2, 5.9, 2.3, 'I. virginica'],
  1499.     [6.7, 3.3, 5.7, 2.5, 'I. virginica'],
  1500.     [6.7, 3.0, 5.2, 2.3, 'I. virginica'],
  1501.     [6.3, 2.5, 5.0, 1.9, 'I. virginica'],
  1502.     [6.5, 3.0, 5.2, 2.0, 'I. virginica'],
  1503.     [6.2, 3.4, 5.4, 2.3, 'I. virginica'],
  1504.     [4.7, 3.2, 1.6, 0.2, 'I. setosa'],
  1505.     [4.8, 3.1, 1.6, 0.2, 'I. setosa'],
  1506.     [5.4, 3.4, 1.5, 0.4, 'I. setosa'],
  1507.     [5.2, 4.1, 1.5, 0.1, 'I. setosa'],
  1508.     [5.5, 4.2, 1.4, 0.2, 'I. setosa'],
  1509.     [4.9, 3.1, 1.5, 0.2, 'I. setosa'],
  1510.     [5.0, 3.2, 1.2, 0.2, 'I. setosa'],
  1511.     [5.5, 3.5, 1.3, 0.2, 'I. setosa'],
  1512.     [4.9, 3.6, 1.4, 0.1, 'I. setosa'],
  1513.     [4.4, 3.0, 1.3, 0.2, 'I. setosa'],
  1514.     [5.1, 3.4, 1.5, 0.2, 'I. setosa'],
  1515.     [5.0, 3.5, 1.3, 0.3, 'I. setosa'],
  1516.     [4.5, 2.3, 1.3, 0.3, 'I. setosa'],
  1517.     [4.4, 3.2, 1.3, 0.2, 'I. setosa'],
  1518.     [5.0, 3.5, 1.6, 0.6, 'I. setosa'],
  1519.     [5.1, 3.8, 1.9, 0.4, 'I. setosa'],
  1520.     [4.8, 3.0, 1.4, 0.3, 'I. setosa'],
  1521.     [5.1, 3.8, 1.6, 0.2, 'I. setosa'],
  1522.     [5.9, 3.0, 5.1, 1.8, 'I. virginica']
  1523. ]
  1524.  
  1525.  
  1526. class decisionnode:
  1527.     def __init__(self, col=-1, value=None, results=None, tb=None, fb=None,l=None):
  1528.         self.col = col
  1529.         self.value = value
  1530.         self.results = results
  1531.         self.tb = tb
  1532.         self.fb = fb
  1533.         self.l = l
  1534.  
  1535.  
  1536. def sporedi_broj(row, column, value):
  1537.     return row[column] >= value
  1538.  
  1539.  
  1540. def sporedi_string(row, column, value):
  1541.     return row[column] == value
  1542.  
  1543.  
  1544. # Divides a set on a specific column. Can handle numeric
  1545. # or nominal values
  1546. def divideset(rows, column, value):
  1547.     # Make a function that tells us if a row is in
  1548.     # the first group (true) or the second group (false)
  1549.     split_function = None
  1550.     if isinstance(value, int) or isinstance(value, float):  # ako vrednosta so koja sporeduvame e od tip int ili float
  1551.         # split_function=lambda row:row[column]>=value # togas vrati funkcija cij argument e row i vrakja vrednost true ili false
  1552.         split_function = sporedi_broj
  1553.     else:
  1554.         # split_function=lambda row:row[column]==value # ako vrednosta so koja sporeduvame e od drug tip (string)
  1555.         split_function = sporedi_string
  1556.  
  1557.     # Divide the rows into two sets and return them
  1558.     set_false = []
  1559.     set_true = []
  1560.     for row in rows:
  1561.         if split_function(row, column, value):
  1562.             set_true.append(row)
  1563.         else:
  1564.             set_false.append(row)
  1565.     set1 = [row for row in rows if
  1566.             split_function(row, column, value)]  # za sekoj row od rows za koj split_function vrakja true
  1567.     set2 = [row for row in rows if
  1568.             not split_function(row, column, value)]  # za sekoj row od rows za koj split_function vrakja false
  1569.     # return (set1, set2)
  1570.     return (set_true, set_false)
  1571.  
  1572.  
  1573. #st, sf = divideset(my_data, 3, 20)
  1574. #print(sf)
  1575. #print(st)
  1576.  
  1577.  
  1578. # Create counts of possible results (the last column of
  1579. # each row is the result)
  1580. def uniquecounts(rows):
  1581.     results = {}
  1582.     for row in rows:
  1583.         # The result is the last column
  1584.         r = row[-1]
  1585.         results.setdefault(r, 0)
  1586.         results[r] += 1
  1587.  
  1588.     return results
  1589.  
  1590.  
  1591. #print(uniquecounts(my_data))
  1592. #print(uniquecounts(st))
  1593. #print(uniquecounts(sf))
  1594.  
  1595.  
  1596. # Probability that a randomly placed item will
  1597. # be in the wrong category
  1598.  
  1599. def log2(x):
  1600.     from math import log
  1601.     l2 = log(x) / log(2)
  1602.     return l2
  1603.  
  1604.  
  1605. # Entropy is the sum of p(x)log(p(x)) across all
  1606. # the different possible results
  1607. def entropy(rows):
  1608.     results = uniquecounts(rows)
  1609.     # Now calculate the entropy
  1610.     ent = 0.0
  1611.     for r in results.keys():
  1612.         p = float(results[r]) / len(rows)
  1613.         ent = ent - p * log2(p)
  1614.     return ent
  1615.  
  1616.  
  1617. #print(entropy(my_data), entropy(st), entropy(sf))
  1618.  
  1619.  
  1620. def buildtree(rows,l=-1, scoref=entropy):
  1621.     if len(rows) == 0: return decisionnode()
  1622.     current_score = scoref(rows)
  1623.  
  1624.     # Set up some variables to track the best criteria
  1625.     best_gain = 0.0
  1626.     best_column = -1
  1627.     best_value = None
  1628.     best_subsetf = None
  1629.     best_subsett = None
  1630.  
  1631.     column_count = len(rows[0]) - 1
  1632.     for col in range(column_count):
  1633.         # Generate the list of different values in
  1634.         # this column
  1635.         column_values = set()
  1636.         for row in rows:
  1637.             column_values.add(row[col])
  1638.         # Now try dividing the rows up for each value
  1639.         # in this column
  1640.         for value in column_values:
  1641.             (set1, set2) = divideset(rows, col, value)
  1642.  
  1643.             # Information gain
  1644.             p = float(len(set1)) / len(rows)
  1645.             gain = current_score - p * scoref(set1) - (1 - p) * scoref(set2)
  1646.             if gain > best_gain and len(set1) > 0 and len(set2) > 0:
  1647.                 best_gain = gain
  1648.                 best_column = col
  1649.                 best_value = value
  1650.                 best_subsett = set1
  1651.                 best_subsetf = set2
  1652.                 # best_criteria = (col, value)
  1653.                 # best_sets = (set1, set2)
  1654.  
  1655.     # Create the subbranches
  1656.     if best_gain > 0:
  1657.         l = l + 1
  1658.         trueBranch = buildtree(best_subsett,l, scoref)
  1659.         falseBranch = buildtree(best_subsetf,l, scoref)
  1660.         return decisionnode(col=best_column, value=best_value,
  1661.                             tb=trueBranch, fb=falseBranch,l = l)
  1662.     else:
  1663.         return decisionnode(results=uniquecounts(rows))
  1664.  
  1665.  
def printtree(tree, indent=''):
    """Pretty-print the tree; interior nodes also show their level (Problem 1).

    NOTE(review): Python 2 code — the bare ``print str(...)`` below is a
    print statement and will not compile under Python 3.
    """
    # Is this a leaf node?
    if tree.results != None:
        # Leaf: print the sorted (class, count) pairs.
        print str(sorted(tree.results.items()))
    else:
        # Print the criteria
        print( str(tree.col) + ':' + str(tree.value) + '? '+'Level='+str(tree.l))
        # Print the branches
        # In Python 2 the trailing comma suppresses the newline, so each
        # subtree starts on the same line as its T->/F-> label.
        print(indent + 'T->'),
        printtree(tree.tb, indent + '  ')
        print(indent + 'F->'),
        printtree(tree.fb, indent + '  ')
  1678.  
  1679.  
  1680. def classify(observation, tree):
  1681.     if tree.results != None:
  1682.  
  1683.          maxi=0
  1684.         # print tree.results
  1685.         #for k in tree.results:
  1686.         #    if tree.results[k]>=maxi:
  1687.         #        maxi=tree.results[k]
  1688.         #for k in tree.results:
  1689.         #    if tree.results[k] == maxi:
  1690.         #        lista.append(k)
  1691.         #lista.sort()
  1692.         #return lista[0]
  1693.         return tree.results
  1694.     else:
  1695.         vrednost = observation[tree.col]
  1696.         branch = None
  1697.  
  1698.         if isinstance(vrednost, int) or isinstance(vrednost, float):
  1699.             if vrednost >= tree.value:
  1700.                 branch = tree.tb
  1701.             else:
  1702.                 branch = tree.fb
  1703.         else:
  1704.             if vrednost == tree.value:
  1705.                 branch = tree.tb
  1706.             else:
  1707.                 branch = tree.fb
  1708.  
  1709.         return classify(observation, branch)
  1710.  
  1711.  
  1712.  
if __name__ == '__main__':
    # Build two trees from the two halves of trainingData and compare how
    # they classify one hand-made instance.
    # NOTE(review): Python 2 code — print statements, integer division
    # (leng/2) and list-returning dict.keys()/values() below.

    # Hard-coded test instance; its class label is carried along but ignored
    # by classify().
    arg1 = 1
    arg2 = 2.2
    arg3 = 4.0
    arg4 = 1.1
    cl = 'I. virginica'

    tmp = [arg1,arg2,arg3,arg4,cl]

    # Split the training set into two halves.
    p1 = []
    p2  = []
    leng = len(trainingData)

    for i in range(0,leng/2):
        p1.append(trainingData[i])

    for i in range(leng/2,len(trainingData)):
        p2.append(trainingData[i])

    # One tree per half.
    d1 = buildtree(p1)
    d2 = buildtree(p2)

    # Py2 print evaluates items left to right: the header is printed, then
    # printtree() prints the tree as a side effect, then its return value
    # (None) is printed.
    print 'DRVO 1\n',printtree(d1)
    print 'DRVO 2\n', printtree(d2)

    # Each classification is a {class: count} dict from the reached leaf.
    k1 = classify(tmp,d1)
    k2 = classify(tmp,d2)

    print k1
    print k2

    # NOTE(review): keys()[0]/values()[0] pick an arbitrary entry when a
    # leaf holds more than one class — relies on Py2 dict ordering.
    if k1.keys()[0] == k2.keys()[0]:
        print k1.keys()[0]
    if(k1.values()[0]>k2.values()[0]):
        print k1.keys()[0]
        print 'KONTRADIKCIJA'
    if(k2.values()[0]>k1.values()[0]):
        print k2.keys()[0]
        print 'KONTRADIKCIJA'
  1753. ------------------------------------------------------------------------------------------------------
  1754. ------------------------------------------------------------------------------------------------------
  1755. ### Klasifikacija ###
  1756. """Задача 1 Problem 1 (2 / 3)
  1757. Дадено е тренинг множество од неколку документи. Притоа се знае секој документ од која класа е
  1758. (science или sport). Mножеството е претставено како листи од торки, така што во секоја торка
  1759. прв елемент е текстот на документот како стринг, а втор елемент е класата како стринг.
  1760. Да се истренира модел врз основа на тренинг множеството и потоа за секој документ
  1761. прочитан од стандарден влез да се испечати неговата класа.
  1762. """
  1763. #!/usr/bin/python
  1764. # -*- coding: utf-8 -*-
  1765.  
  1766. import re
  1767. import math
  1768.  
# Training corpus for the document classifier: a list of
# (document_text, class_label) tuples, labels being 'science' or 'sport'.
train_data = [
    ("""What Are We Searching for on Mars?
Martians terrified me growing up. I remember watching the 1996 movie Mars Attacks! and fearing that the Red Planet harbored hostile alien neighbors. Though I was only 6 at the time, I was convinced life on Mars meant little green men wielding vaporizer guns. There was a time, not so long ago, when such an assumption about Mars wouldn’t have seemed so far-fetched.
Like a child watching a scary movie, people freaked out after listening to “The War of the Worlds,” the now-infamous 1938 radio drama that many listeners believed was a real report about an invading Martian army. Before humans left Earth, humanity’s sense of what—or who—might be in our galactic neighborhood was, by today’s standards, remarkably optimistic.
""",
     "science"),
    ("""Mountains of Ice are Melting, But Don't Panic (Op-Ed)
If the planet lost the entire West Antarctic ice sheet, global sea level would rise 11 feet, threatening nearly 13 million people worldwide and affecting more than $2 trillion worth of property.
Ice loss from West Antarctica has been increasing nearly three times faster in the past decade than during the previous one — and much more quickly than scientists predicted.
This unprecedented ice loss is occurring because warm ocean water is rising from below and melting the base of the glaciers, dumping huge volumes of additional water — the equivalent of a Mt. Everest every two years — into the ocean.
""",
     "science"),
    ("""Some scientists think we'll find signs of aliens within our lifetimes. Here's how.
Finding extraterrestrial life is the essence of science fiction. But it's not so far-fetched to predict that we might find evidence of life on a distant planet within a generation.
"With new telescopes coming online within the next five or ten years, we'll really have a chance to figure out whether we're alone in the universe," says Lisa Kaltenegger, an astronomer and director of Cornell's new Institute for Pale Blue Dots, which will search for habitable planets. "For the first time in human history, we might have the capability to do this."
""",
     "science"),
    ("""'Magic' Mushrooms in Royal Garden: What Is Fly Agaric?
Hallucinogenic mushrooms are perhaps the last thing you'd expect to find growing in the Queen of England's garden.
Yet a type of mushroom called Amanita muscaria — commonly known as fly agaric, or fly amanita — was found growing in the gardens of Buckingham Palace by the producers of a television show, the Associated Press reported on Friday (Dec. 12).
A. muscaria is a bright red-and-white mushroom, and the fungus is psychoactive when consumed.
""",
     "science"),
    ("""Upcoming Parks : 'Lost Corner' Finds New Life in Sandy Springs
At the corner of Brandon Mill Road, where Johnson Ferry Road turns into Dalrymple Road, tucked among 24 forested acres, sits an early 20th Century farmhouse. A vestige of Sandy Springs' past, the old home has found new life as the centerpiece of Lost Forest Preserve. While the preserve isn't slated to officially debut until some time next year, the city has opened the hiking trails to the public until construction begins on the permanent parking lot (at the moment the parking lot is a mulched area). The new park space includes community garden plots, a 4,000-foot-long hiking trail and an ADA-accessible trail through the densely wooded site. For Atlantans seeking an alternate escape to serenity (or those who dig local history), it's certainly worth a visit.
""",
     "science"),
    ("""Stargazers across the world got a treat this weekend when the Geminids meteor shower gave the best holiday displays a run for their money.
The meteor shower is called the "Geminids" because they appear as though they are shooting out of the constellation of Gemini. The meteors are thought to be small pieces of an extinct comment called 3200 Phaeton, a dust cloud revolving around the sun. Phaeton is thought to have lost all of its gas and to be slowly breaking apart into small particles.
Earth runs into a stream of debris from 3200 Phaethon every year in mid-December, causing a shower of meteors, which hit its peak over the weekend.
""",
     "science"),
    ("""Envisioning a River of Air
By the classification rules of the world of physics, we all know that the Earth's atmosphere is made of gas (rather than liquid, solid, or plasma). But in the world of flying it's often useful to think
""",
     "science"),
    ("""Following Sunday's 17-7 loss to the Seattle Seahawks, the San Francisco 49ers were officially eliminated from playoff contention, and they have referee Ed Hochuli to blame. OK, so they have a lot of folks to point the finger at for their 7-7 record, but Hochuli's incorrect call is the latest and easiest scapegoat.
"""
     , "sport"),
    ("""Kobe Bryant and his teammates have an odd relationship. That makes sense: Kobe Bryant is an odd guy, and the Los Angeles Lakers are an odd team.
They’re also, for the first time this season, the proud owners of a three-game winning streak. On top of that, you may have heard, Kobe Bryant passed Michael Jordan on Sunday evening to move into third place on the NBA’s all-time scoring list.
"""
     , "sport"),
    ("""The Patriots continued their divisional dominance and are close to clinching home-field advantage throughout the AFC playoffs. Meanwhile, both the Colts and Broncos again won their division titles with head-to-head wins.The Bills' upset of the Packers delivered a big blow to Green Bay's shot at clinching home-field advantage throughout the NFC playoffs. Detroit seized on the opportunity and now leads the NFC North.
"""
     , "sport"),
    ("""If you thought the Washington Redskins secondary was humbled by another scintillating performance from New Yorks Giants rookie wide receiver sensation Odell Beckham Jr., think again.In what is becoming a weekly occurrence, Beckham led NFL highlight reels on Sunday, collecting 12 catches for 143 yards and three touchdowns in Sunday's 24-13 victory against an NFC East rival.
"""
     , "sport")
    , ("""That was two touchdowns and 110 total yards for the three running backs. We break down the fantasy implications.The New England Patriots' rushing game has always been tough to handicap. Sunday, all three of the team's primary running backs put up numbers, and all in different ways, but it worked for the team, as the Patriots beat the Miami Dolphins, 41-13.
"""
       , "sport"),
    ("""General Santos (Philippines) (AFP) - Philippine boxing legend Manny Pacquiao vowed to chase Floyd Mayweather into ring submission after his US rival offered to fight him next year in a blockbuster world title face-off. "He (Mayweather) has reached a dead end. He has nowhere to run but to fight me," Pacquiao told AFP late Saturday, hours after the undefeated Mayweather issued the May 2 challenge on US television. The two were long-time rivals as the "best pound-for-pound" boxers of their generation, but the dream fight has never materialised to the disappointment of the boxing world.
"""
     , "sport"),
    ("""When St. John's landed Rysheed Jordan, the consensus was that he would be an excellent starter.
So far, that's half true.
Jordan came off the bench Sunday and tied a career high by scoring 24 points to lead No. 24 St. John's to a 74-53 rout of Fordham in the ECAC Holiday Festival.
''I thought Rysheed played with poise,'' Red Storm coach Steve Lavin said. ''Played with the right pace. Near perfect game.''
"""
     , "sport"),
    ("""Five-time world player of the year Marta scored three goals to lead Brazil to a 3-2 come-from-behind win over the U.S. women's soccer team in the International Tournament of Brasilia on Sunday. Carli Lloyd and Megan Rapinoe scored a goal each in the first 10 minutes to give the U.S. an early lead, but Marta netted in the 19th, 55th and 66th minutes to guarantee the hosts a spot in the final of the four-team competition.
"""
     , "sport"),
]
  1834.  
  1835.  
  1836. def getwords(doc):
  1837.     splitter = re.compile('\\W*')
  1838.     words = [word.lower() for word in splitter.split(doc) if len(word) > 2 and len(word) < 20]
  1839.  
  1840.     return dict([(word, 1) for word in words])
  1841.  
  1842.  
  1843. class documentClassifier:
  1844.     def __init__(self, getfeatures, filename=None):
  1845.         self.featureCountsPerCategory = {}
  1846.         self.categoryCounts = {}
  1847.         self.getfeatures = getfeatures
  1848.  
  1849.     def incrementFeatureCountsPerCategory(self, currentFeature, currentCategory):
  1850.         self.featureCountsPerCategory.setdefault(currentFeature, {})
  1851.         self.featureCountsPerCategory[currentFeature].setdefault(currentCategory, 0)
  1852.         self.featureCountsPerCategory[currentFeature][currentCategory] += 1
  1853.  
  1854.     def incrementCategoryCounts(self, cat):
  1855.         self.categoryCounts.setdefault(cat, 0)
  1856.         self.categoryCounts[cat] += 1
  1857.  
  1858.     def getFeatureCountsPerCategory(self, currentFeature, currentCategory):
  1859.         if currentFeature in self.featureCountsPerCategory and currentCategory in self.featureCountsPerCategory[
  1860.             currentFeature]:
  1861.             return float(self.featureCountsPerCategory[currentFeature][currentCategory])
  1862.         return 0.0
  1863.  
  1864.     def getCategoryCount(self, currentCategory):
  1865.         if currentCategory in self.categoryCounts:
  1866.             return float(self.categoryCounts[currentCategory])
  1867.         return 0
  1868.  
  1869.     def getTotal(self):
  1870.         return sum(self.categoryCounts.values())
  1871.  
  1872.     def categories(self):
  1873.         return self.categoryCounts.keys()
  1874.  
  1875.     def train(self, item, currentCategory):
  1876.         features = self.getfeatures(item)
  1877.         for currentFeature in features:
  1878.             self.incrementFeatureCountsPerCategory(currentFeature, currentCategory)
  1879.         self.incrementCategoryCounts(currentCategory)
  1880.  
  1881.     def getFeaturePerCategoryProbability(self, currentFeature, currentCategory):
  1882.         if self.getCategoryCount(currentCategory) == 0: return 0
  1883.         return self.getFeatureCountsPerCategory(currentFeature, currentCategory) / self.getCategoryCount(
  1884.             currentCategory)
  1885.  
  1886.     def weightedprob(self, currentFeature, currentCategory, prf, weight=1.0, ap=0.5):
  1887.         basicprob = prf(currentFeature, currentCategory)
  1888.         totals = sum([self.getFeatureCountsPerCategory(currentFeature, currentCategory) for currentCategory in
  1889.                       self.categories()])
  1890.         bp = ((weight * ap) + (totals * basicprob)) / (weight + totals)
  1891.         return bp
  1892.  
  1893.  
  1894. class naivebayes(documentClassifier):
  1895.     def __init__(self, getfeatures):
  1896.         documentClassifier.__init__(self, getfeatures)
  1897.         self.thresholds = {}
  1898.  
  1899.     def setThreshold(self, currentCategory, threshold):
  1900.         self.thresholds[currentCategory] = threshold
  1901.  
  1902.     def getThreshold(self, currentCategory):
  1903.         if currentCategory not in self.thresholds: return 1.0
  1904.         return self.thresholds[currentCategory]
  1905.  
  1906.     def calculateDocumentProbabilityInClass(self, item, currentCategory):
  1907.         features = self.getfeatures(item)
  1908.         p = 1
  1909.         for currentFeature in features:
  1910.             p *= self.weightedprob(currentFeature, currentCategory, self.getFeaturePerCategoryProbability)
  1911.  
  1912.         return p
  1913.  
  1914.     def getCategoryProbabilityForDocument(self, item, currentCategory):
  1915.         catprob = self.getCategoryCount(currentCategory) / self.getTotal()
  1916.         calculateDocumentProbabilityInClass = self.calculateDocumentProbabilityInClass(item, currentCategory)
  1917.  
  1918.         return calculateDocumentProbabilityInClass * catprob / (1.0 / self.getTotal())
  1919.  
  1920.     def classifyDocument(self, item, default=None):
  1921.         probs = {}
  1922.         max = 0.0
  1923.  
  1924.         for cat in self.categories():
  1925.             probs[cat] = self.getCategoryProbabilityForDocument(item, cat)
  1926.             if probs[cat] > max:
  1927.                 max = probs[cat]
  1928.                 best = cat
  1929.  
  1930.         for cat in probs:
  1931.             if cat == best: continue
  1932.             if probs[cat] * self.getThreshold(best) > probs[best]: return default
  1933.         return best
  1934.  
  1935.  
def trainClassifier(cl, data):
    """Train classifier ``cl`` on five hard-coded sentences.

    NOTE(review): the ``data`` parameter is accepted but never used — the
    corpus passed in by the caller is ignored entirely; confirm intent.
    NOTE(review): the first sentence (Chelsea/Rooney, clearly football) is
    labelled 'science' — this looks like a labelling mistake, but the label
    is runtime data so it is flagged here rather than changed.
    """
    cl.train("""So far this season Chelsea have looked the class of the league, but that does not faze Rooney.""",
             "science")
    cl.train("""Particularly for the strikers whose hype and profiles eclipse their prowess in front of goal. ""","sport")
    cl.train("""Armed with a bachelors degree in botany and a masters in microbiology, Hoffman moved to Charlotte where she became director of the Charlotte Nature Museum. ""","science")
    cl.train("""Chicago Bears quarterback Jay Cutler sits down for his usual press conference, but doesn't stay long as the media room has yet to be filled with most of the local media members.""","sport")
    cl.train("""When astronauts return from the International Space Station, their capsule hits the atmosphere at a speed of more than 17,000 miles per hour.""","science")
  1943.  
  1944.  
  1945. if __name__ == "__main__":
  1946.     cl = naivebayes(getwords)
  1947.     trainClassifier(cl, train_data)
  1948.     recenica = input()
  1949.     # klasa = 'bad'
  1950.     # print klasa
  1951.     print cl.classifyDocument(recenica)
  1952. -----------------------------------------------------------------------------------------------------
  1953. Klasifikacija lab2
  1954.  
  1955. #!/usr/bin/python
  1956. # -*- coding: utf-8 -*-
  1957.  
  1958. import re
  1959. import math
  1960.  
  1961. train_data = [
  1962.     ("""What Are We Searching for on Mars?
  1963. Martians terrified me growing up. I remember watching the 1996 movie Mars Attacks! and fearing that the Red Planet harbored hostile alien neighbors. Though I was only 6 at the time, I was convinced life on Mars meant little green men wielding vaporizer guns. There was a time, not so long ago, when such an assumption about Mars wouldn’t have seemed so far-fetched.
  1964. Like a child watching a scary movie, people freaked out after listening to “The War of the Worlds,” the now-infamous 1938 radio drama that many listeners believed was a real report about an invading Martian army. Before humans left Earth, humanity’s sense of what—or who—might be in our galactic neighborhood was, by today’s standards, remarkably optimistic.
  1965. """,
  1966.      "science"),
  1967.     ("""Mountains of Ice are Melting, But Don't Panic (Op-Ed)
  1968. If the planet lost the entire West Antarctic ice sheet, global sea level would rise 11 feet, threatening nearly 13 million people worldwide and affecting more than $2 trillion worth of property.
  1969. Ice loss from West Antarctica has been increasing nearly three times faster in the past decade than during the previous one — and much more quickly than scientists predicted.
  1970. This unprecedented ice loss is occurring because warm ocean water is rising from below and melting the base of the glaciers, dumping huge volumes of additional water — the equivalent of a Mt. Everest every two years — into the ocean.
  1971. """,
  1972.      "science"),
  1973.     ("""Some scientists think we'll find signs of aliens within our lifetimes. Here's how.
  1974. Finding extraterrestrial life is the essence of science fiction. But it's not so far-fetched to predict that we might find evidence of life on a distant planet within a generation.
  1975. "With new telescopes coming online within the next five or ten years, we'll really have a chance to figure out whether we're alone in the universe," says Lisa Kaltenegger, an astronomer and director of Cornell's new Institute for Pale Blue Dots, which will search for habitable planets. "For the first time in human history, we might have the capability to do this."
  1976. """,
  1977.      "science"),
  1978.     ("""'Magic' Mushrooms in Royal Garden: What Is Fly Agaric?
  1979. Hallucinogenic mushrooms are perhaps the last thing you'd expect to find growing in the Queen of England's garden.
  1980. Yet a type of mushroom called Amanita muscaria — commonly known as fly agaric, or fly amanita — was found growing in the gardens of Buckingham Palace by the producers of a television show, the Associated Press reported on Friday (Dec. 12).
  1981. A. muscaria is a bright red-and-white mushroom, and the fungus is psychoactive when consumed.
  1982. """,
  1983.      "science"),
  1984.     ("""Upcoming Parks : 'Lost Corner' Finds New Life in Sandy Springs
  1985. At the corner of Brandon Mill Road, where Johnson Ferry Road turns into Dalrymple Road, tucked among 24 forested acres, sits an early 20th Century farmhouse. A vestige of Sandy Springs' past, the old home has found new life as the centerpiece of Lost Forest Preserve. While the preserve isn't slated to officially debut until some time next year, the city has opened the hiking trails to the public until construction begins on the permanent parking lot (at the moment the parking lot is a mulched area). The new park space includes community garden plots, a 4,000-foot-long hiking trail and an ADA-accessible trail through the densely wooded site. For Atlantans seeking an alternate escape to serenity (or those who dig local history), it's certainly worth a visit.
  1986. """,
  1987.      "science"),
  1988.     ("""Stargazers across the world got a treat this weekend when the Geminids meteor shower gave the best holiday displays a run for their money.
  1989. The meteor shower is called the "Geminids" because they appear as though they are shooting out of the constellation of Gemini. The meteors are thought to be small pieces of an extinct comment called 3200 Phaeton, a dust cloud revolving around the sun. Phaeton is thought to have lost all of its gas and to be slowly breaking apart into small particles.
  1990. Earth runs into a stream of debris from 3200 Phaethon every year in mid-December, causing a shower of meteors, which hit its peak over the weekend.
  1991. """,
  1992.      "science"),
  1993.     ("""Envisioning a River of Air
  1994. By the classification rules of the world of physics, we all know that the Earth's atmosphere is made of gas (rather than liquid, solid, or plasma). But in the world of flying it's often useful to think
  1995. """,
  1996.      "science"),
  1997.     ("""Following Sunday's 17-7 loss to the Seattle Seahawks, the San Francisco 49ers were officially eliminated from playoff contention, and they have referee Ed Hochuli to blame. OK, so they have a lot of folks to point the finger at for their 7-7 record, but Hochuli's incorrect call is the latest and easiest scapegoat.
  1998. """
  1999.      , "sport"),
  2000.     ("""Kobe Bryant and his teammates have an odd relationship. That makes sense: Kobe Bryant is an odd guy, and the Los Angeles Lakers are an odd team.
  2001. They’re also, for the first time this season, the proud owners of a three-game winning streak. On top of that, you may have heard, Kobe Bryant passed Michael Jordan on Sunday evening to move into third place on the NBA’s all-time scoring list.
  2002. """
  2003.      , "sport"),
  2004.     ("""The Patriots continued their divisional dominance and are close to clinching home-field advantage throughout the AFC playoffs. Meanwhile, both the Colts and Broncos again won their division titles with head-to-head wins.The Bills' upset of the Packers delivered a big blow to Green Bay's shot at clinching home-field advantage throughout the NFC playoffs. Detroit seized on the opportunity and now leads the NFC North.
  2005. """
  2006.      , "sport"),
  2007.     ("""If you thought the Washington Redskins secondary was humbled by another scintillating performance from New Yorks Giants rookie wide receiver sensation Odell Beckham Jr., think again.In what is becoming a weekly occurrence, Beckham led NFL highlight reels on Sunday, collecting 12 catches for 143 yards and three touchdowns in Sunday's 24-13 victory against an NFC East rival.
  2008. """
  2009.      , "sport")
  2010.     , ("""That was two touchdowns and 110 total yards for the three running backs. We break down the fantasy implications.The New England Patriots' rushing game has always been tough to handicap. Sunday, all three of the team's primary running backs put up numbers, and all in different ways, but it worked for the team, as the Patriots beat the Miami Dolphins, 41-13.
  2011. """
  2012.        , "sport"),
  2013.     ("""General Santos (Philippines) (AFP) - Philippine boxing legend Manny Pacquiao vowed to chase Floyd Mayweather into ring submission after his US rival offered to fight him next year in a blockbuster world title face-off. "He (Mayweather) has reached a dead end. He has nowhere to run but to fight me," Pacquiao told AFP late Saturday, hours after the undefeated Mayweather issued the May 2 challenge on US television. The two were long-time rivals as the "best pound-for-pound" boxers of their generation, but the dream fight has never materialised to the disappointment of the boxing world.
  2014. """
  2015.      , "sport"),
  2016.     ("""When St. John's landed Rysheed Jordan, the consensus was that he would be an excellent starter.
  2017. So far, that's half true.
  2018. Jordan came off the bench Sunday and tied a career high by scoring 24 points to lead No. 24 St. John's to a 74-53 rout of Fordham in the ECAC Holiday Festival.
  2019. ''I thought Rysheed played with poise,'' Red Storm coach Steve Lavin said. ''Played with the right pace. Near perfect game.''
  2020. """
  2021.      , "sport"),
  2022.     ("""Five-time world player of the year Marta scored three goals to lead Brazil to a 3-2 come-from-behind win over the U.S. women's soccer team in the International Tournament of Brasilia on Sunday. Carli Lloyd and Megan Rapinoe scored a goal each in the first 10 minutes to give the U.S. an early lead, but Marta netted in the 19th, 55th and 66th minutes to guarantee the hosts a spot in the final of the four-team competition.
  2023. """
  2024.      , "sport"),
  2025. ]
  2026.  
  2027.  
  2028. def getwords(doc):
  2029.     splitter = re.compile('\\W*')
  2030.     words = [word.lower() for word in splitter.split(doc) if len(word) > 2 and len(word) < 20]
  2031.  
  2032.     return dict([(word, 1) for word in words])
  2033.  
  2034.  
  2035. class documentClassifier:
  2036.     def __init__(self, getfeatures, filename=None):
  2037.         self.featureCountsPerCategory = {}
  2038.         self.categoryCounts = {}
  2039.         self.getfeatures = getfeatures
  2040.  
  2041.     def incrementFeatureCountsPerCategory(self, currentFeature, currentCategory):
  2042.         self.featureCountsPerCategory.setdefault(currentFeature, {})
  2043.         self.featureCountsPerCategory[currentFeature].setdefault(currentCategory, 0)
  2044.         self.featureCountsPerCategory[currentFeature][currentCategory] += 1
  2045.  
  2046.     def incrementCategoryCounts(self, cat):
  2047.         self.categoryCounts.setdefault(cat, 0)
  2048.         self.categoryCounts[cat] += 1
  2049.  
  2050.     def getFeatureCountsPerCategory(self, currentFeature, currentCategory):
  2051.         if currentFeature in self.featureCountsPerCategory and currentCategory in self.featureCountsPerCategory[
  2052.             currentFeature]:
  2053.             return float(self.featureCountsPerCategory[currentFeature][currentCategory])
  2054.         return 0.0
  2055.  
  2056.     def getCategoryCount(self, currentCategory):
  2057.         if currentCategory in self.categoryCounts:
  2058.             return float(self.categoryCounts[currentCategory])
  2059.         return 0
  2060.  
  2061.     def getTotal(self):
  2062.         return sum(self.categoryCounts.values())
  2063.  
  2064.     def categories(self):
  2065.         return self.categoryCounts.keys()
  2066.  
  2067.     def train(self, item, currentCategory):
  2068.         features = self.getfeatures(item)
  2069.         for currentFeature in features:
  2070.             self.incrementFeatureCountsPerCategory(currentFeature, currentCategory)
  2071.         self.incrementCategoryCounts(currentCategory)
  2072.  
  2073.     def getFeaturePerCategoryProbability(self, currentFeature, currentCategory):
  2074.         if self.getCategoryCount(currentCategory) == 0: return 0
  2075.         return self.getFeatureCountsPerCategory(currentFeature, currentCategory) / self.getCategoryCount(
  2076.             currentCategory)
  2077.  
  2078.     def weightedprob(self, currentFeature, currentCategory, prf, weight=1.0, ap=0.5):
  2079.         basicprob = prf(currentFeature, currentCategory)
  2080.         totals = sum([self.getFeatureCountsPerCategory(currentFeature, currentCategory) for currentCategory in
  2081.                       self.categories()])
  2082.         bp = ((weight * ap) + (totals * basicprob)) / (weight + totals)
  2083.         return bp
  2084.  
  2085.  
  2086. class naivebayes(documentClassifier):
  2087.     def __init__(self, getfeatures):
  2088.         documentClassifier.__init__(self, getfeatures)
  2089.         self.thresholds = {}
  2090.  
  2091.     def setThreshold(self, currentCategory, threshold):
  2092.         self.thresholds[currentCategory] = threshold
  2093.  
  2094.     def getThreshold(self, currentCategory):
  2095.         if currentCategory not in self.thresholds: return 1.0
  2096.         return self.thresholds[currentCategory]
  2097.  
  2098.     def calculateDocumentProbabilityInClass(self, item, currentCategory):
  2099.         features = self.getfeatures(item)
  2100.         p = 1
  2101.         for currentFeature in features:
  2102.             p *= self.weightedprob(currentFeature, currentCategory, self.getFeaturePerCategoryProbability)
  2103.  
  2104.         return p
  2105.  
  2106.     def getCategoryProbabilityForDocument(self, item, currentCategory):
  2107.         catprob = self.getCategoryCount(currentCategory) / self.getTotal()
  2108.         calculateDocumentProbabilityInClass = self.calculateDocumentProbabilityInClass(item, currentCategory)
  2109.  
  2110.         return calculateDocumentProbabilityInClass * catprob / (1.0 / self.getTotal())
  2111.  
  2112.     def classifyDocument(self, item, default=None):
  2113.         probs = {}
  2114.         max = 0.0
  2115.  
  2116.         for cat in self.categories():
  2117.             probs[cat] = self.getCategoryProbabilityForDocument(item, cat)
  2118.             if probs[cat] > max:
  2119.                 max = probs[cat]
  2120.                 best = cat
  2121.  
  2122.         for cat in probs:
  2123.             if cat == best: continue
  2124.             if probs[cat] * self.getThreshold(best) > probs[best]: return default
  2125.         return best
  2126.  
  2127.  
  2128. def trainClassifier(cl, data):
  2129.     #cl.train("""So far this season Chelsea have looked the class of the league, but that does not faze Rooney.""","science")
  2130.     #cl.train("""Particularly for the strikers whose hype and profiles eclipse their prowess in front of goal. ""","sport")
  2131.     #cl.train("""Armed with a bachelors degree in botany and a masters in microbiology, Hoffman moved to Charlotte where she became director of the Charlotte Nature Museum. ""","science")
  2132.     #cl.train("""Chicago Bears quarterback Jay Cutler sits down for his usual press conference, but doesn't stay long as the media room has yet to be filled with most of the local media members.""","sport")
  2133.     #cl.train("""When astronauts return from the International Space Station, their capsule hits the atmosphere at a speed of more than 17,000 miles per hour.""","science")
  2134.  
  2135.     for el in data:
  2136.         cl.train(el[0],el[1])
  2137.  
  2138.  
  2139.  
if __name__ == "__main__":
    # Build a naive Bayes classifier over bag-of-words features and
    # train it on the predefined labelled training set.
    cl = naivebayes(getwords)
    trainClassifier(cl, train_data)
    # NOTE(review): Python 2 input() eval()s the typed text, so the document
    # is presumably entered as a quoted string — confirm with the grader.
    recenica = input()
    klasa = cl.classifyDocument(recenica)
    verojatnost=cl.getCategoryProbabilityForDocument(recenica,klasa)

    # print klasa

    # Predicted class and its score with 8 decimals (Python 2 print statement).
    print klasa, '%.8f'% verojatnost
  2150. -----------------------------------------------------------------------------------------------------
  2151. Klasifikacija 3
  2152. """
2153. За секој прочитан документ од стандарден влез да се испечатат зборовите кои се употребуваат
  2154. за класификација, класата и веројатноста на зборот да е од таа класа (заокружено на 4 децимали),
  2155. како и тежинската веројатност на зборот да припаѓа на класата (заокружено на 4 децимали).
  2156. На крај да се испечати предвидената класа на документот и логаритам со основа 2 од веројатноста
  2157. со која се предвидува (заокружено на 4 децимали).
  2158. """
  2159.  
  2160. #!/usr/bin/python
  2161. # -*- coding: utf-8 -*-
  2162.  
  2163. import re
  2164. import math
  2165.  
# Labelled training corpus: list of (document_text, category) tuples;
# the categories are "science" and "sport".
train_data = [
    ("""What Are We Searching for on Mars?
Martians terrified me growing up. I remember watching the 1996 movie Mars Attacks! and fearing that the Red Planet harbored hostile alien neighbors. Though I was only 6 at the time, I was convinced life on Mars meant little green men wielding vaporizer guns. There was a time, not so long ago, when such an assumption about Mars wouldn’t have seemed so far-fetched.
Like a child watching a scary movie, people freaked out after listening to “The War of the Worlds,” the now-infamous 1938 radio drama that many listeners believed was a real report about an invading Martian army. Before humans left Earth, humanity’s sense of what—or who—might be in our galactic neighborhood was, by today’s standards, remarkably optimistic.
""",
     "science"),
    ("""Mountains of Ice are Melting, But Don't Panic (Op-Ed)
If the planet lost the entire West Antarctic ice sheet, global sea level would rise 11 feet, threatening nearly 13 million people worldwide and affecting more than $2 trillion worth of property.
Ice loss from West Antarctica has been increasing nearly three times faster in the past decade than during the previous one — and much more quickly than scientists predicted.
This unprecedented ice loss is occurring because warm ocean water is rising from below and melting the base of the glaciers, dumping huge volumes of additional water — the equivalent of a Mt. Everest every two years — into the ocean.
""",
     "science"),
    ("""Some scientists think we'll find signs of aliens within our lifetimes. Here's how.
Finding extraterrestrial life is the essence of science fiction. But it's not so far-fetched to predict that we might find evidence of life on a distant planet within a generation.
"With new telescopes coming online within the next five or ten years, we'll really have a chance to figure out whether we're alone in the universe," says Lisa Kaltenegger, an astronomer and director of Cornell's new Institute for Pale Blue Dots, which will search for habitable planets. "For the first time in human history, we might have the capability to do this."
""",
     "science"),
    ("""'Magic' Mushrooms in Royal Garden: What Is Fly Agaric?
Hallucinogenic mushrooms are perhaps the last thing you'd expect to find growing in the Queen of England's garden.
Yet a type of mushroom called Amanita muscaria — commonly known as fly agaric, or fly amanita — was found growing in the gardens of Buckingham Palace by the producers of a television show, the Associated Press reported on Friday (Dec. 12).
A. muscaria is a bright red-and-white mushroom, and the fungus is psychoactive when consumed.
""",
     "science"),
    ("""Upcoming Parks : 'Lost Corner' Finds New Life in Sandy Springs
At the corner of Brandon Mill Road, where Johnson Ferry Road turns into Dalrymple Road, tucked among 24 forested acres, sits an early 20th Century farmhouse. A vestige of Sandy Springs' past, the old home has found new life as the centerpiece of Lost Forest Preserve. While the preserve isn't slated to officially debut until some time next year, the city has opened the hiking trails to the public until construction begins on the permanent parking lot (at the moment the parking lot is a mulched area). The new park space includes community garden plots, a 4,000-foot-long hiking trail and an ADA-accessible trail through the densely wooded site. For Atlantans seeking an alternate escape to serenity (or those who dig local history), it's certainly worth a visit.
""",
     "science"),
    ("""Stargazers across the world got a treat this weekend when the Geminids meteor shower gave the best holiday displays a run for their money.
The meteor shower is called the "Geminids" because they appear as though they are shooting out of the constellation of Gemini. The meteors are thought to be small pieces of an extinct comment called 3200 Phaeton, a dust cloud revolving around the sun. Phaeton is thought to have lost all of its gas and to be slowly breaking apart into small particles.
Earth runs into a stream of debris from 3200 Phaethon every year in mid-December, causing a shower of meteors, which hit its peak over the weekend.
""",
     "science"),
    ("""Envisioning a River of Air
By the classification rules of the world of physics, we all know that the Earth's atmosphere is made of gas (rather than liquid, solid, or plasma). But in the world of flying it's often useful to think
""",
     "science"),
    ("""Following Sunday's 17-7 loss to the Seattle Seahawks, the San Francisco 49ers were officially eliminated from playoff contention, and they have referee Ed Hochuli to blame. OK, so they have a lot of folks to point the finger at for their 7-7 record, but Hochuli's incorrect call is the latest and easiest scapegoat.
"""
     , "sport"),
    ("""Kobe Bryant and his teammates have an odd relationship. That makes sense: Kobe Bryant is an odd guy, and the Los Angeles Lakers are an odd team.
They’re also, for the first time this season, the proud owners of a three-game winning streak. On top of that, you may have heard, Kobe Bryant passed Michael Jordan on Sunday evening to move into third place on the NBA’s all-time scoring list.
"""
     , "sport"),
    ("""The Patriots continued their divisional dominance and are close to clinching home-field advantage throughout the AFC playoffs. Meanwhile, both the Colts and Broncos again won their division titles with head-to-head wins.The Bills' upset of the Packers delivered a big blow to Green Bay's shot at clinching home-field advantage throughout the NFC playoffs. Detroit seized on the opportunity and now leads the NFC North.
"""
     , "sport"),
    ("""If you thought the Washington Redskins secondary was humbled by another scintillating performance from New Yorks Giants rookie wide receiver sensation Odell Beckham Jr., think again.In what is becoming a weekly occurrence, Beckham led NFL highlight reels on Sunday, collecting 12 catches for 143 yards and three touchdowns in Sunday's 24-13 victory against an NFC East rival.
"""
     , "sport")
    , ("""That was two touchdowns and 110 total yards for the three running backs. We break down the fantasy implications.The New England Patriots' rushing game has always been tough to handicap. Sunday, all three of the team's primary running backs put up numbers, and all in different ways, but it worked for the team, as the Patriots beat the Miami Dolphins, 41-13.
"""
       , "sport"),
    ("""General Santos (Philippines) (AFP) - Philippine boxing legend Manny Pacquiao vowed to chase Floyd Mayweather into ring submission after his US rival offered to fight him next year in a blockbuster world title face-off. "He (Mayweather) has reached a dead end. He has nowhere to run but to fight me," Pacquiao told AFP late Saturday, hours after the undefeated Mayweather issued the May 2 challenge on US television. The two were long-time rivals as the "best pound-for-pound" boxers of their generation, but the dream fight has never materialised to the disappointment of the boxing world.
"""
     , "sport"),
    ("""When St. John's landed Rysheed Jordan, the consensus was that he would be an excellent starter.
So far, that's half true.
Jordan came off the bench Sunday and tied a career high by scoring 24 points to lead No. 24 St. John's to a 74-53 rout of Fordham in the ECAC Holiday Festival.
''I thought Rysheed played with poise,'' Red Storm coach Steve Lavin said. ''Played with the right pace. Near perfect game.''
"""
     , "sport"),
    ("""Five-time world player of the year Marta scored three goals to lead Brazil to a 3-2 come-from-behind win over the U.S. women's soccer team in the International Tournament of Brasilia on Sunday. Carli Lloyd and Megan Rapinoe scored a goal each in the first 10 minutes to give the U.S. an early lead, but Marta netted in the 19th, 55th and 66th minutes to guarantee the hosts a spot in the final of the four-team competition.
"""
     , "sport"),
]
  2231.  
  2232. def getwords(doc):
  2233.     splitter = re.compile('\\W*')
  2234.     words = [word.lower() for word in splitter.split(doc) if len(word) > 2 and len(word) < 20]
  2235.  
  2236.     return dict([(word, 1) for word in words])
  2237.  
  2238.  
  2239. class documentClassifier:
  2240.     def __init__(self, getfeatures, filename=None):
  2241.         self.featureCountsPerCategory = {}
  2242.         self.categoryCounts = {}
  2243.         self.getfeatures = getfeatures
  2244.  
  2245.     def incrementFeatureCountsPerCategory(self, currentFeature, currentCategory):
  2246.         self.featureCountsPerCategory.setdefault(currentFeature, {})
  2247.         self.featureCountsPerCategory[currentFeature].setdefault(currentCategory, 0)
  2248.         self.featureCountsPerCategory[currentFeature][currentCategory] += 1
  2249.  
  2250.     def incrementCategoryCounts(self, cat):
  2251.         self.categoryCounts.setdefault(cat, 0)
  2252.         self.categoryCounts[cat] += 1
  2253.  
  2254.     def getFeatureCountsPerCategory(self, currentFeature, currentCategory):
  2255.         if currentFeature in self.featureCountsPerCategory and currentCategory in self.featureCountsPerCategory[
  2256.             currentFeature]:
  2257.             return float(self.featureCountsPerCategory[currentFeature][currentCategory])
  2258.         return 0.0
  2259.  
  2260.     def getCategoryCount(self, currentCategory):
  2261.         if currentCategory in self.categoryCounts:
  2262.             return float(self.categoryCounts[currentCategory])
  2263.         return 0
  2264.  
  2265.     def getTotal(self):
  2266.         return sum(self.categoryCounts.values())
  2267.  
  2268.     def categories(self):
  2269.         return self.categoryCounts.keys()
  2270.  
  2271.     def train(self, item, currentCategory):
  2272.         features = self.getfeatures(item)
  2273.         for currentFeature in features:
  2274.             self.incrementFeatureCountsPerCategory(currentFeature, currentCategory)
  2275.         self.incrementCategoryCounts(currentCategory)
  2276.  
  2277.     def getFeaturePerCategoryProbability(self, currentFeature, currentCategory):
  2278.         if self.getCategoryCount(currentCategory) == 0: return 0
  2279.         return self.getFeatureCountsPerCategory(currentFeature, currentCategory) / self.getCategoryCount(
  2280.             currentCategory)
  2281.  
  2282.     def weightedprob(self, currentFeature, currentCategory, prf, weight=1.0, ap=0.5):
  2283.         basicprob = prf(currentFeature, currentCategory)
  2284.         totals = sum([self.getFeatureCountsPerCategory(currentFeature, currentCategory) for currentCategory in
  2285.                       self.categories()])
  2286.         bp = ((weight * ap) + (totals * basicprob)) / (weight + totals)
  2287.         return bp
  2288.  
  2289.  
  2290. class naivebayes(documentClassifier):
  2291.     def __init__(self, getfeatures):
  2292.         documentClassifier.__init__(self, getfeatures)
  2293.         self.thresholds = {}
  2294.  
  2295.     def setThreshold(self, currentCategory, threshold):
  2296.         self.thresholds[currentCategory] = threshold
  2297.  
  2298.     def getThreshold(self, currentCategory):
  2299.         if currentCategory not in self.thresholds: return 1.0
  2300.         return self.thresholds[currentCategory]
  2301.  
  2302.     def calculateDocumentProbabilityInClass(self, item, currentCategory):
  2303.         features = self.getfeatures(item)
  2304.         p = 1
  2305.         for currentFeature in features:
  2306.             p *= self.weightedprob(currentFeature, currentCategory, self.getFeaturePerCategoryProbability)
  2307.  
  2308.         return p
  2309.  
  2310.     def getCategoryProbabilityForDocument(self, item, currentCategory):
  2311.         catprob = self.getCategoryCount(currentCategory) / self.getTotal()
  2312.         calculateDocumentProbabilityInClass = self.calculateDocumentProbabilityInClass(item, currentCategory)
  2313.  
  2314.         return calculateDocumentProbabilityInClass * catprob / (1.0 / self.getTotal())
  2315.  
  2316.     def classifyDocument(self, item, default=None):
  2317.         probs = {}
  2318.         max = 0.0
  2319.  
  2320.         for cat in self.categories():
  2321.             probs[cat] = self.getCategoryProbabilityForDocument(item, cat)
  2322.             if probs[cat] > max:
  2323.                 max = probs[cat]
  2324.                 best = cat
  2325.  
  2326.         for cat in probs:
  2327.             if cat == best: continue
  2328.             if probs[cat] * self.getThreshold(best) > probs[best]: return default
  2329.         return best
  2330.  
  2331.  
  2332.  
  2333. def trainClassifier(cl, data):
  2334.     for el in data:
  2335.         cl.train(el[0],el[1])
  2336.  
  2337.  
  2338.  
  2339.  
  2340.  
  2341.  
  2342.  
  2343.  
  2344.  
if __name__ == "__main__":
    # Train the naive Bayes classifier on the labelled corpus above.
    cl = naivebayes(getwords)
    trainClassifier(cl, train_data)
    # NOTE(review): Python 2 input() eval()s the typed text, so the document
    # is presumably entered as a quoted string — confirm with the grader.
    recenica = input()
    cl.setThreshold('science',1)
    klasa=cl.classifyDocument(recenica)
    pom=cl.getCategoryProbabilityForDocument(recenica,klasa)
    verojatnost=0
    # log base 2 of the winning score via change of base, 4 decimals.
    verojatnost=round(math.log(pom)/math.log(2),4)



    # For every word used in classification, print per category the raw
    # conditional probability and the weighted (smoothed) probability,
    # both rounded to 4 decimals (Python 2 print statements).
    zborovi =getwords(recenica)
    kategorii = cl.categories()
    for zbor in zborovi:
        for kategorija in kategorii:
            verojatnostNaZbor = round(cl.getFeaturePerCategoryProbability(zbor,kategorija),4)
            verojatnostNaZborTezinska = round(cl.weightedprob(zbor,kategorija,cl.getFeaturePerCategoryProbability),4)
            print zbor, kategorija, verojatnostNaZbor, verojatnostNaZborTezinska
    #klasa = 'bad'
    #verojatnost = 0
    # Finally: predicted class and the log2 probability computed above.
    print klasa, verojatnost
  2367. -----------------------------------------------------------------------------------------------------
  2368. Klasifikacija ispit januari-courses
  2369.  
  2370. Заради потребата на софистицирана класификација на документи, веќе е имплементирана и достапна во почетниот
  2371. код фунцијата getwords_with_ignore која ги дава уникатните зборовите од еден документ така што зборовите
  2372. кои се веќе во интерната променлива words_to_ingore се игнорираат. Значи секој збор во words_to_ingore не
  2373. фигурира во речникот со уникатни зборови кој се добива како резултат на getwords_with_ignore.
  2374.  
  2375. Множеството на податоци train_data е предефинирано. Притоа се знае секој документ од која класа е
  2376. (science или sport). Mножеството е претставено како листи од торки, така што во секоја торка прв елемент е
  2377. текстот на документот како стринг, а втор елемент е класата како стринг. Да се истренира класификатор со
  2378. користење на стандардната getwords (од аудиториските вежби) врз основа на тренинг множеството. Исто така
  2379. да се направат потребните промени за да се истренира и втор класификатор кој ќе го употребува истото
  2380. тренинг множество, но притоа ќе ја употребува новата функција која е веќе имплементирана
  2381. getwords_with_ignore.
  2382.  
  2383. Потоа за секој документ прочитан од стандарден влез да се испечатат 2 реда. Првиот ред ја содржи
  2384. предвидената класа со стандардниот класификатор и логаритам со основа 2 од веројатноста со која се
  2385. предвидува (заокружено на 4 децимали), а вториот ред предвидената класа со помош на вториот класификатор и
  2386. логаритам со основа 2 од веројатноста со која се предвидува (заокружено на 4 децимали). Да се испечати
  2387. колку пати втората веројатност е поголема од првата заокружено на 4 децимали. Ако предвидувањето на двата
2388. класификатори е различно да се испечати уште еден ред со зборот “kontradikcija”.
  2389.  
  2390.     Vlez:
  2391.         """Just last week, preservationists at the Old Pejeta animal sanctuary in Kenya conceded
  2392.        that their one male and two female northern white rhinos will not reproduce naturally.
  2393.        The animals were flown from the Czech zoo to the Kenyan conservancy in December 2009 in
  2394.        hopes that the natural environment could be easier for them to breed there than in captivity."""
  2395.     Izlez:
  2396. science -51.5029
  2397. science -46.0544
  2398. 43.6706
  2399.  
  2400. Vlez 2:
  2401.        
  2402. """HONOLULU (AP) — Lava from a volcano on Hawaii's Big Island is on course to reach a shopping center
  2403. with a gas station and a supermarket in seven to 10 days, officials said Monday."""
  2404.     Izlez 2:
  2405. sport -21.1937
  2406. science -17.6781
  2407. 11.4370
  2408. kontradikcija
  2409.  
  2410. ====*===*====*============*****============***=============**=============***
  2411. #!/usr/bin/python
  2412. # -*- coding: utf-8 -*-
  2413.  
  2414. import re
  2415. import math
  2416.  
  2417.  
  2418.  
  2419. train_data=[
  2420. ("""What Are We Searching for on Mars?
  2421. Martians terrified me growing up. I remember watching the 1996 movie Mars Attacks! and fearing that the Red Planet harbored hostile alien neighbors. Though I was only 6 at the time, I was convinced life on Mars meant little green men wielding vaporizer guns. There was a time, not so long ago, when such an assumption about Mars wouldn’t have seemed so far-fetched.
  2422. Like a child watching a scary movie, people freaked out after listening to “The War of the Worlds,” the now-infamous 1938 radio drama that many listeners believed was a real report about an invading Martian army. Before humans left Earth, humanity’s sense of what—or who—might be in our galactic neighborhood was, by today’s standards, remarkably optimistic.
  2423. """,
  2424. "science"),
  2425. ("""Mountains of Ice are Melting, But Don't Panic (Op-Ed)
  2426. If the planet lost the entire West Antarctic ice sheet, global sea level would rise 11 feet, threatening nearly 13 million people worldwide and affecting more than $2 trillion worth of property.
  2427. Ice loss from West Antarctica has been increasing nearly three times faster in the past decade than during the previous one — and much more quickly than scientists predicted.
  2428. This unprecedented ice loss is occurring because warm ocean water is rising from below and melting the base of the glaciers, dumping huge volumes of additional water — the equivalent of a Mt. Everest every two years — into the ocean.
  2429. """,
  2430. "science"),
  2431. ("""Some scientists think we'll find signs of aliens within our lifetimes. Here's how.
  2432. Finding extraterrestrial life is the essence of science fiction. But it's not so far-fetched to predict that we might find evidence of life on a distant planet within a generation.
  2433. "With new telescopes coming online within the next five or ten years, we'll really have a chance to figure out whether we're alone in the universe," says Lisa Kaltenegger, an astronomer and director of Cornell's new Institute for Pale Blue Dots, which will search for habitable planets. "For the first time in human history, we might have the capability to do this."
  2434. """,
  2435. "science"),
  2436. ("""'Magic' Mushrooms in Royal Garden: What Is Fly Agaric?
  2437. Hallucinogenic mushrooms are perhaps the last thing you'd expect to find growing in the Queen of England's garden.
  2438. Yet a type of mushroom called Amanita muscaria — commonly known as fly agaric, or fly amanita — was found growing in the gardens of Buckingham Palace by the producers of a television show, the Associated Press reported on Friday (Dec. 12).
  2439. A. muscaria is a bright red-and-white mushroom, and the fungus is psychoactive when consumed.
  2440. """,
  2441. "science"),
  2442. ("""Upcoming Parks : 'Lost Corner' Finds New Life in Sandy Springs
  2443. At the corner of Brandon Mill Road, where Johnson Ferry Road turns into Dalrymple Road, tucked among 24 forested acres, sits an early 20th Century farmhouse. A vestige of Sandy Springs' past, the old home has found new life as the centerpiece of Lost Forest Preserve. While the preserve isn't slated to officially debut until some time next year, the city has opened the hiking trails to the public until construction begins on the permanent parking lot (at the moment the parking lot is a mulched area). The new park space includes community garden plots, a 4,000-foot-long hiking trail and an ADA-accessible trail through the densely wooded site. For Atlantans seeking an alternate escape to serenity (or those who dig local history), it's certainly worth a visit.
  2444. """,
  2445. "science"),
  2446. ("""Stargazers across the world got a treat this weekend when the Geminids meteor shower gave the best holiday displays a run for their money.
  2447. The meteor shower is called the "Geminids" because they appear as though they are shooting out of the constellation of Gemini. The meteors are thought to be small pieces of an extinct comment called 3200 Phaeton, a dust cloud revolving around the sun. Phaeton is thought to have lost all of its gas and to be slowly breaking apart into small particles.
  2448. Earth runs into a stream of debris from 3200 Phaethon every year in mid-December, causing a shower of meteors, which hit its peak over the weekend.
  2449. """,
  2450. "science"),
  2451. ("""Envisioning a River of Air
  2452. By the classification rules of the world of physics, we all know that the Earth's atmosphere is made of gas (rather than liquid, solid, or plasma). But in the world of flying it's often useful to think
  2453. """,
  2454. "science"),
  2455. ("""Following Sunday's 17-7 loss to the Seattle Seahawks, the San Francisco 49ers were officially eliminated from playoff contention, and they have referee Ed Hochuli to blame. OK, so they have a lot of folks to point the finger at for their 7-7 record, but Hochuli's incorrect call is the latest and easiest scapegoat.
  2456. """
  2457. ,"sport"),
  2458. ("""Kobe Bryant and his teammates have an odd relationship. That makes sense: Kobe Bryant is an odd guy, and the Los Angeles Lakers are an odd team.
  2459. They’re also, for the first time this season, the proud owners of a three-game winning streak. On top of that, you may have heard, Kobe Bryant passed Michael Jordan on Sunday evening to move into third place on the NBA’s all-time scoring list.
  2460. """
  2461. ,"sport"),
  2462. ("""The Patriots continued their divisional dominance and are close to clinching home-field advantage throughout the AFC playoffs. Meanwhile, both the Colts and Broncos again won their division titles with head-to-head wins.The Bills' upset of the Packers delivered a big blow to Green Bay's shot at clinching home-field advantage throughout the NFC playoffs. Detroit seized on the opportunity and now leads the NFC North.
  2463. """
  2464. ,"sport"),
  2465. ("""If you thought the Washington Redskins secondary was humbled by another scintillating performance from New Yorks Giants rookie wide receiver sensation Odell Beckham Jr., think again.In what is becoming a weekly occurrence, Beckham led NFL highlight reels on Sunday, collecting 12 catches for 143 yards and three touchdowns in Sunday's 24-13 victory against an NFC East rival.
  2466. """
  2467. ,"sport")
  2468. ,("""That was two touchdowns and 110 total yards for the three running backs. We break down the fantasy implications.The New England Patriots' rushing game has always been tough to handicap. Sunday, all three of the team's primary running backs put up numbers, and all in different ways, but it worked for the team, as the Patriots beat the Miami Dolphins, 41-13.
  2469. """
  2470. ,"sport"),
  2471. ("""General Santos (Philippines) (AFP) - Philippine boxing legend Manny Pacquiao vowed to chase Floyd Mayweather into ring submission after his US rival offered to fight him next year in a blockbuster world title face-off. "He (Mayweather) has reached a dead end. He has nowhere to run but to fight me," Pacquiao told AFP late Saturday, hours after the undefeated Mayweather issued the May 2 challenge on US television. The two were long-time rivals as the "best pound-for-pound" boxers of their generation, but the dream fight has never materialised to the disappointment of the boxing world.
  2472. """
  2473. ,"sport"),
  2474. ("""When St. John's landed Rysheed Jordan, the consensus was that he would be an excellent starter.
  2475. So far, that's half true.
  2476. Jordan came off the bench Sunday and tied a career high by scoring 24 points to lead No. 24 St. John's to a 74-53 rout of Fordham in the ECAC Holiday Festival.
  2477. ''I thought Rysheed played with poise,'' Red Storm coach Steve Lavin said. ''Played with the right pace. Near perfect game.''
  2478. """
  2479. ,"sport"),
  2480. ("""Five-time world player of the year Marta scored three goals to lead Brazil to a 3-2 come-from-behind win over the U.S. women's soccer team in the International Tournament of Brasilia on Sunday. Carli Lloyd and Megan Rapinoe scored a goal each in the first 10 minutes to give the U.S. an early lead, but Marta netted in the 19th, 55th and 66th minutes to guarantee the hosts a spot in the final of the four-team competition.
  2481. """
  2482. ,"sport"),
  2483. ]
  2484.  
  2485.  
  2486. def getwords(doc, words_to_ignore=None):
  2487.     splitter = re.compile('\\W*')
  2488.  
  2489.     words = [word.lower() for word in splitter.split(doc) if
  2490.              len(word) > 2 and len(word) < 20 and (words_to_ignore is None or word.lower() not in words_to_ignore)]
  2491.  
  2492.     return dict([(word, 1) for word in words])
  2493.  
  2494. def getwords_with_ignore(doc,words_to_ignore=['and', 'are', 'for', 'was', 'what', 'when', 'who', 'but', 'from', 'after', 'out', 'our', 'my', 'the', 'with', 'some', 'not', 'this', 'that']):
  2495.     return getwords(doc,words_to_ignore)
  2496.  
  2497. def getwords2(doc,words_to_ignore=None):
  2498.     splitter=re.compile('\\W*')
  2499.  
  2500.     words=[word.lower() for word in splitter.split(doc) if len(word)>2 and len(word)<20 and (words_to_ignore is None or word.lower() not in words_to_ignore)]
  2501.     return dict([(word,1) for word in words])
  2502.  
  2503.  
# Default English stop-word list used by the ignore-aware tokenizers.
words_to_ignore=['and', 'are', 'for', 'was', 'what', 'when', 'who', 'but', 'from', 'after', 'out', 'our', 'my', 'the', 'with', 'some', 'not', 'this', 'that']
  2505.  
  2506. def trainClassifier(cl, data):
  2507.     for i in data:
  2508.         cl.train(i[0],i[1])
  2509.     # cl.train(data[0][0],data[0][1])
  2510.  
  2511.  
  2512. class documentClassifier:
  2513.     def __init__(self, getfeatures, filename=None):
  2514.         # Broj na parovi atribut/kategorija (feature/category)
  2515.         self.featureCountsPerCategory = {}
  2516.         # Broj na dokumenti vo sekoja kategorija
  2517.         self.categoryCounts = {}
  2518.         # funkcija za dobivanje na atributite (zborovite) vo dokumentot
  2519.         self.getfeatures = getfeatures
  2520.  
  2521.     # Zgolemuvanje na brojot na parovi atribut/kategorija
  2522.     def incrementFeatureCountsPerCategory(self, currentFeature, currentCategory):
  2523.         self.featureCountsPerCategory.setdefault(currentFeature, {})
  2524.         self.featureCountsPerCategory[currentFeature].setdefault(currentCategory, 0)
  2525.         self.featureCountsPerCategory[currentFeature][currentCategory] += 1
  2526.  
  2527.     # Zgolemuvanje na brojot na predmeti(dokumenti) vo kategorija
  2528.     def incrementCategoryCounts(self, cat):
  2529.         self.categoryCounts.setdefault(cat, 0)
  2530.         self.categoryCounts[cat] += 1
  2531.  
  2532.     # Dobivanje na brojot kolku pati
  2533.     # odreden atribut se ima pojaveno vo odredena kategorija
  2534.     def getFeatureCountsPerCategory(self, currentFeature, currentCategory):
  2535.         if currentFeature in self.featureCountsPerCategory and currentCategory in self.featureCountsPerCategory[
  2536.             currentFeature]:
  2537.             return float(self.featureCountsPerCategory[currentFeature][currentCategory])
  2538.         return 0.0
  2539.  
  2540.     # Dobivanje na brojot na predmeti(dokumenti) vo kategorija
  2541.     def getCategoryCount(self, currentCategory):
  2542.         if currentCategory in self.categoryCounts:
  2543.             return float(self.categoryCounts[currentCategory])
  2544.         return 0
  2545.  
  2546.     # Dobivanje na vkupniot broj na predmeti
  2547.     def getTotalCount(self):
  2548.         return sum(self.categoryCounts.values())
  2549.  
  2550.     # Dobivanje na lista na site kategorii
  2551.     def categories(self):
  2552.         return self.categoryCounts.keys()
  2553.  
  2554.     # Treniranje na klasifikatorot
  2555.     # Noviot predmet (dokument) item pripagja na kategorijata cat
  2556.     def train(self, item, currentCategory):
  2557.         # Se zemaat atributite (zborovite) vo predmetot(dokumentot)
  2558.         features = self.getfeatures(item)
  2559.         # Se zgolemuva brojot na sekoj atribut vo ovaa kategorija
  2560.         for currentFeature in features:
  2561.             self.incrementFeatureCountsPerCategory(currentFeature, currentCategory)
  2562.  
  2563.         # Se zgolemuva brojot na predmeti (dokumenti) vo ovaa kategorija
  2564.         self.incrementCategoryCounts(currentCategory)
  2565.  
  2566.     def getFeaturePerCategoryProbability(self, currentFeature, currentCategory):
  2567.         if self.getCategoryCount(currentCategory) == 0: return 0
  2568.         # Verojatnosta e vkupniot broj na pati koga
  2569.         # ovoj atribut f (zbor) se pojavil vo ovaa
  2570.         # kategorija podeleno so vkupniot broj na
  2571.         # predmeti (dokumenti) vo ovaa kategorija
  2572.         return self.getFeatureCountsPerCategory(currentFeature, currentCategory) / self.getCategoryCount(
  2573.             currentCategory)
  2574.  
  2575.     def weightedprob(self, currentFeature, currentCategory, prf, weight=1.0, ap=0.5):
  2576.         # Presmetaj ja osnovnata verojatnost
  2577.         basicprob = prf(currentFeature, currentCategory)
  2578.         # Izbroj kolku pati se ima pojaveno ovoj atribut (zbor)
  2579.         # vo site kategorii
  2580.         totals = sum([self.getFeatureCountsPerCategory(currentFeature, currentCategory) for currentCategory in
  2581.                       self.categories()])
  2582.         # Presmetaj ja tezinski usrednetata verojatnost
  2583.         bp = ((weight * ap) + (totals * basicprob)) / (weight + totals)
  2584.         return bp
  2585.  
  2586.  
  2587. class naivebayes(documentClassifier):
  2588.     def __init__(self, getfeatures):
  2589.         documentClassifier.__init__(self, getfeatures)
  2590.         self.thresholds = {}
  2591.  
  2592.     def setThreshold(self, currentCategory, threshold):
  2593.         self.thresholds[currentCategory] = threshold
  2594.  
  2595.     def getThreshold(self, currentCategory):
  2596.         if currentCategory not in self.thresholds: return 1.0
  2597.         return self.thresholds[currentCategory]
  2598.  
  2599.     # ja vrakja verojatnosta na dokumentot da e od klasata cat (cat e odnapred poznata)
  2600.     def caclulateDocumentProbabilityInClass(self, item, currentCategory):
  2601.         # zemi gi zborovite vo dokumentot item
  2602.         features = self.getfeatures(item)
  2603.         # pomnozi gi verojatnostite na site zborovi
  2604.         p = 1
  2605.         for currentFeature in features:
  2606.             p *= self.weightedprob(currentFeature, currentCategory,
  2607.                                    self.getFeaturePerCategoryProbability)
  2608.  
  2609.         return p
  2610.  
  2611.     # Ja vrakja verojatnosta na klasata ako e poznat dokumentot
  2612.     def getCategoryProbabilityForDocument(self, item, currentCategory):
  2613.         catprob = self.getCategoryCount(currentCategory) / self.getTotalCount()
  2614.         caclulateDocumentProbabilityInClass = self.caclulateDocumentProbabilityInClass(item, currentCategory)
  2615.         # Bayes Theorem
  2616.         return caclulateDocumentProbabilityInClass * catprob / (1.0 / self.getTotalCount())
  2617.  
  2618.     # klasificiranje na dokument
  2619.     def classifyDocument(self, item, default=None):
  2620.         probs = {}
  2621.         # najdi ja kategorijata (klasata)
  2622.         # so najgolema verojatnost
  2623.         max = 0.0
  2624.         for cat in self.categories():
  2625.             probs[cat] = self.getCategoryProbabilityForDocument(item, cat)
  2626.             if probs[cat] > max:
  2627.                 max = probs[cat]
  2628.                 best = cat
  2629.  
  2630.         # proveri dali verojatnosta e pogolema od
  2631.         # threshold*next best (sledna najdobra)
  2632.         for cat in probs:
  2633.             if cat == best: continue
  2634.             if probs[cat] * self.getThreshold(best) > probs[best]: return default
  2635.  
  2636.         return best
  2637.  
  2638.  
if __name__ == "__main__":
    # Problem driver (Python 2 syntax: print statements, input() evaluates the
    # line read from stdin -- presumably the grader feeds a quoted string).
    # Reads one document, classifies it with two Naive Bayes classifiers that
    # differ only in tokenizer (plain vs. stop-word-aware), and prints each
    # predicted class with the log2 of its (unnormalised) probability.
    #recenica = """Just last week, preservationists at the Old Pejeta animal sanctuary in Kenya conceded that their one male and two female northern white rhinos will not reproduce naturally. The animals were flown from the Czech zoo to the Kenyan conservancy in December 2009 in hopes that the natural environment could be easier for them to breed there than in captivity."""
    recenica = input()
    cl = naivebayes(getwords)
    cl2 = naivebayes(getwords_with_ignore)
    # both classifiers are trained on the same corpus
    trainClassifier(cl,train_data)
    trainClassifier(cl2,train_data)
    klasa1 = cl.classifyDocument(recenica)
    klasa2 = cl2.classifyDocument(recenica)
    verojatnost1 = cl.getCategoryProbabilityForDocument(recenica, klasa1)
    verojatnost2 = cl2.getCategoryProbabilityForDocument(recenica, klasa2)
    # print (klasa1, klasa2)
    print klasa1,"%.4f" % math.log(verojatnost1,2)
    print klasa2,"%.4f" % math.log(verojatnost2,2)
    # ratio of the two unnormalised probabilities
    print "%.4f" % (verojatnost2/verojatnost1)
    if klasa1 != klasa2:
        # the two tokenizers disagree on the predicted class
        print 'kontradikcija'
  2656. ------------------------------------------------------------------------------------------------------
  2657. #-*- coding: utf-8 -*-
  2658.  
  2659. """
  2660. Класификација на документи (испит)
  2661. Заради потребата на софистицирана класификација на документи, фунцијата која ги дава уникатните зборовите од
еден документ getwords треба да се промени така што ќе прима втор опционален аргумент words_to_ignore. Ако не
се проследи опционалниот аргумент подразбирлива вредност е None. Кога се проследува вредност таа вредност треба
да биде листа од зборови кои функцијата ќе ги изоставува при генерирањето на излезниот речник. Значи секој збор
во words_to_ignore не треба да фигурира во речникот со уникатни зборови кој се добива како резултат на getwords.
  2666.  
  2667. Множеството на податоци train_data е предефинирано.
  2668. Притоа се знае секој документ од која класа е (science или sport).
  2669. Mножеството е претставено како листи од торки, така што во секоја торка прв елемент е текстот на документот како стринг,
  2670. а втор елемент е класата како стринг. Да се истренира класификатор со користење на стандардната getwords (од аудиториските вежби)
  2671. врз основа на тренинг множеството. Исто така да се направат потребните промени за да се истренира и втор
  2672. класификатор кој ќе го употребува истото тренинг множество, но притоа ќе ја употребува новата функција која е веќе
  2673. имплементирана getwords_with_ignore.
  2674.  
  2675. Потоа за секој документ прочитан од стандарден влез да се испечатат 2 реда.
  2676. Првиот ред ја содржи предвидената класа со стандардниот класификатор, а вториот ред предвидената класа со помош на
  2677. вториот класификатор. Ако предвидувањето на двата класификатори е различно
да се испечати уште еден ред со зборот “kontradikcija”.
  2679.  
  2680. """
  2681.  
  2682. import re
  2683. import math
  2684.  
# Training corpus: list of (document_text, class_label) tuples; the label is
# either "science" or "sport".
train_data=[
("""What Are We Searching for on Mars?
Martians terrified me growing up. I remember watching the 1996 movie Mars Attacks! and fearing that the Red Planet harbored hostile alien neighbors. Though I was only 6 at the time, I was convinced life on Mars meant little green men wielding vaporizer guns. There was a time, not so long ago, when such an assumption about Mars wouldn’t have seemed so far-fetched.
Like a child watching a scary movie, people freaked out after listening to “The War of the Worlds,” the now-infamous 1938 radio drama that many listeners believed was a real report about an invading Martian army. Before humans left Earth, humanity’s sense of what—or who—might be in our galactic neighborhood was, by today’s standards, remarkably optimistic.
""",
"science"),
("""Mountains of Ice are Melting, But Don't Panic (Op-Ed)
If the planet lost the entire West Antarctic ice sheet, global sea level would rise 11 feet, threatening nearly 13 million people worldwide and affecting more than $2 trillion worth of property.
Ice loss from West Antarctica has been increasing nearly three times faster in the past decade than during the previous one — and much more quickly than scientists predicted.
This unprecedented ice loss is occurring because warm ocean water is rising from below and melting the base of the glaciers, dumping huge volumes of additional water — the equivalent of a Mt. Everest every two years — into the ocean.
""",
"science"),
("""Some scientists think we'll find signs of aliens within our lifetimes. Here's how.
Finding extraterrestrial life is the essence of science fiction. But it's not so far-fetched to predict that we might find evidence of life on a distant planet within a generation.
"With new telescopes coming online within the next five or ten years, we'll really have a chance to figure out whether we're alone in the universe," says Lisa Kaltenegger, an astronomer and director of Cornell's new Institute for Pale Blue Dots, which will search for habitable planets. "For the first time in human history, we might have the capability to do this."
""",
"science"),
("""'Magic' Mushrooms in Royal Garden: What Is Fly Agaric?
Hallucinogenic mushrooms are perhaps the last thing you'd expect to find growing in the Queen of England's garden.
Yet a type of mushroom called Amanita muscaria — commonly known as fly agaric, or fly amanita — was found growing in the gardens of Buckingham Palace by the producers of a television show, the Associated Press reported on Friday (Dec. 12).
A. muscaria is a bright red-and-white mushroom, and the fungus is psychoactive when consumed.
""",
"science"),
("""Upcoming Parks : 'Lost Corner' Finds New Life in Sandy Springs
At the corner of Brandon Mill Road, where Johnson Ferry Road turns into Dalrymple Road, tucked among 24 forested acres, sits an early 20th Century farmhouse. A vestige of Sandy Springs' past, the old home has found new life as the centerpiece of Lost Forest Preserve. While the preserve isn't slated to officially debut until some time next year, the city has opened the hiking trails to the public until construction begins on the permanent parking lot (at the moment the parking lot is a mulched area). The new park space includes community garden plots, a 4,000-foot-long hiking trail and an ADA-accessible trail through the densely wooded site. For Atlantans seeking an alternate escape to serenity (or those who dig local history), it's certainly worth a visit.
""",
"science"),
("""Stargazers across the world got a treat this weekend when the Geminids meteor shower gave the best holiday displays a run for their money.
The meteor shower is called the "Geminids" because they appear as though they are shooting out of the constellation of Gemini. The meteors are thought to be small pieces of an extinct comment called 3200 Phaeton, a dust cloud revolving around the sun. Phaeton is thought to have lost all of its gas and to be slowly breaking apart into small particles.
Earth runs into a stream of debris from 3200 Phaethon every year in mid-December, causing a shower of meteors, which hit its peak over the weekend.
""",
"science"),
("""Envisioning a River of Air
By the classification rules of the world of physics, we all know that the Earth's atmosphere is made of gas (rather than liquid, solid, or plasma). But in the world of flying it's often useful to think
""",
"science"),
("""Following Sunday's 17-7 loss to the Seattle Seahawks, the San Francisco 49ers were officially eliminated from playoff contention, and they have referee Ed Hochuli to blame. OK, so they have a lot of folks to point the finger at for their 7-7 record, but Hochuli's incorrect call is the latest and easiest scapegoat.
"""
,"sport"),
("""Kobe Bryant and his teammates have an odd relationship. That makes sense: Kobe Bryant is an odd guy, and the Los Angeles Lakers are an odd team.
They’re also, for the first time this season, the proud owners of a three-game winning streak. On top of that, you may have heard, Kobe Bryant passed Michael Jordan on Sunday evening to move into third place on the NBA’s all-time scoring list.
"""
,"sport"),
("""The Patriots continued their divisional dominance and are close to clinching home-field advantage throughout the AFC playoffs. Meanwhile, both the Colts and Broncos again won their division titles with head-to-head wins.The Bills' upset of the Packers delivered a big blow to Green Bay's shot at clinching home-field advantage throughout the NFC playoffs. Detroit seized on the opportunity and now leads the NFC North.
"""
,"sport"),
("""If you thought the Washington Redskins secondary was humbled by another scintillating performance from New Yorks Giants rookie wide receiver sensation Odell Beckham Jr., think again.In what is becoming a weekly occurrence, Beckham led NFL highlight reels on Sunday, collecting 12 catches for 143 yards and three touchdowns in Sunday's 24-13 victory against an NFC East rival.
"""
,"sport")
,("""That was two touchdowns and 110 total yards for the three running backs. We break down the fantasy implications.The New England Patriots' rushing game has always been tough to handicap. Sunday, all three of the team's primary running backs put up numbers, and all in different ways, but it worked for the team, as the Patriots beat the Miami Dolphins, 41-13.
"""
,"sport"),
("""General Santos (Philippines) (AFP) - Philippine boxing legend Manny Pacquiao vowed to chase Floyd Mayweather into ring submission after his US rival offered to fight him next year in a blockbuster world title face-off. "He (Mayweather) has reached a dead end. He has nowhere to run but to fight me," Pacquiao told AFP late Saturday, hours after the undefeated Mayweather issued the May 2 challenge on US television. The two were long-time rivals as the "best pound-for-pound" boxers of their generation, but the dream fight has never materialised to the disappointment of the boxing world.
"""
,"sport"),
("""When St. John's landed Rysheed Jordan, the consensus was that he would be an excellent starter.
So far, that's half true.
Jordan came off the bench Sunday and tied a career high by scoring 24 points to lead No. 24 St. John's to a 74-53 rout of Fordham in the ECAC Holiday Festival.
''I thought Rysheed played with poise,'' Red Storm coach Steve Lavin said. ''Played with the right pace. Near perfect game.''
"""
,"sport"),
("""Five-time world player of the year Marta scored three goals to lead Brazil to a 3-2 come-from-behind win over the U.S. women's soccer team in the International Tournament of Brasilia on Sunday. Carli Lloyd and Megan Rapinoe scored a goal each in the first 10 minutes to give the U.S. an early lead, but Marta netted in the 19th, 55th and 66th minutes to guarantee the hosts a spot in the final of the four-team competition.
"""
,"sport"),
]
  2750.  
  2751.  
  2752.  
# Stop words removed by the ignore-aware tokenizer getwords2 below.
words_to_ignore=['and', 'are', 'for', 'was', 'what', 'when', 'who', 'but', 'from', 'after', 'out', 'our', 'my', 'the', 'with', 'some', 'not', 'this', 'that']
  2754.  
  2755.  
  2756. def getwords(doc):
  2757.     # regularen izraz koj ke go deli stringot na zborovi
  2758.     # stringot se deli na zborovi na site prazni mesta i interpunkciski znaci
  2759.     splitter = re.compile('\\W*')
  2760.     # podeli go dokumentot na zborovi
  2761.     # i konvertiraj gi vo mali bukvi
  2762.     # pa potoa stavi gi vo lista
  2763.     # ako nivnata dolzina e >2 i <20
  2764.     words = [word.lower() for word in splitter.split(doc) if len(word) > 2 and len(word) < 20]
  2765.     # se vrakja recnik cii klucevi se zborovite koi
  2766.     # se vo dokumentot, a eden zbor duri i da se
  2767.     # srekjava poveke pati vo recnikot ke bide samo ednas
  2768.     # vrednosta (1) vo recnikot ne e potrebna
  2769.     return dict([(word, 1) for word in words])
  2770.  
  2771.  
  2772. def getwords2(doc):
  2773.     splitter = re.compile('\\W*')
  2774.     words = []
  2775.     for word in splitter.split(doc):
  2776.         if len(word) > 2 and len(word) < 20 and word.lower() not in words_to_ignore:
  2777.             words.append(word.lower())
  2778.  
  2779.     return dict([(word, 1) for word in words])
  2780.  
  2781.  
  2782.  
  2783. class documentClassifier:
  2784.     def __init__(self, getfeatures, filename=None):
  2785.         # Broj na parovi atribut/kategorija (feature/category)
  2786.         self.featureCountsPerCategory = {}
  2787.         # Broj na dokumenti vo sekoja kategorija
  2788.         self.categoryCounts = {}
  2789.         # funkcija za dobivanje na atributite (zborovite) vo dokumentot
  2790.         self.getfeatures = getfeatures
  2791.  
  2792.     # Zgolemuvanje na brojot na parovi atribut/kategorija
  2793.     def incrementFeatureCountsPerCategory(self, currentFeature, currentCategory):
  2794.         self.featureCountsPerCategory.setdefault(currentFeature, {})
  2795.         self.featureCountsPerCategory[currentFeature].setdefault(currentCategory, 0)
  2796.         self.featureCountsPerCategory[currentFeature][currentCategory] += 1
  2797.  
  2798.     # Zgolemuvanje na brojot na predmeti(dokumenti) vo kategorija
  2799.     def incrementCategoryCounts(self, cat):
  2800.         self.categoryCounts.setdefault(cat, 0)
  2801.         self.categoryCounts[cat] += 1
  2802.  
  2803.     # Dobivanje na brojot kolku pati
  2804.     # odreden atribut se ima pojaveno vo odredena kategorija
  2805.     def getFeatureCountsPerCategory(self, currentFeature, currentCategory):
  2806.         if currentFeature in self.featureCountsPerCategory and currentCategory in self.featureCountsPerCategory[
  2807.             currentFeature]:
  2808.             return float(self.featureCountsPerCategory[currentFeature][currentCategory])
  2809.         return 0.0
  2810.  
  2811.     # Dobivanje na brojot na predmeti(dokumenti) vo kategorija
  2812.     def getCategoryCount(self, currentCategory):
  2813.         if currentCategory in self.categoryCounts:
  2814.             return float(self.categoryCounts[currentCategory])
  2815.         return 0
  2816.  
  2817.     # Dobivanje na vkupniot broj na predmeti
  2818.     def getTotalCount(self):
  2819.         return sum(self.categoryCounts.values())
  2820.  
  2821.     # Dobivanje na lista na site kategorii
  2822.     def categories(self):
  2823.         return self.categoryCounts.keys()
  2824.  
  2825.     # Treniranje na klasifikatorot
  2826.     # Noviot predmet (dokument) item pripagja na kategorijata cat
  2827.     def train(self, item, currentCategory):
  2828.         # Se zemaat atributite (zborovite) vo predmetot(dokumentot)
  2829.         features = self.getfeatures(item)
  2830.         # Se zgolemuva brojot na sekoj atribut vo ovaa kategorija
  2831.         for currentFeature in features:
  2832.             self.incrementFeatureCountsPerCategory(currentFeature, currentCategory)
  2833.  
  2834.         # Se zgolemuva brojot na predmeti (dokumenti) vo ovaa kategorija
  2835.         self.incrementCategoryCounts(currentCategory)
  2836.  
  2837.     def getFeaturePerCategoryProbability(self, currentFeature, currentCategory):
  2838.         if self.getCategoryCount(currentCategory) == 0: return 0
  2839.         # Verojatnosta e vkupniot broj na pati koga
  2840.         # ovoj atribut f (zbor) se pojavil vo ovaa
  2841.         # kategorija podeleno so vkupniot broj na
  2842.         # predmeti (dokumenti) vo ovaa kategorija
  2843.         return self.getFeatureCountsPerCategory(currentFeature, currentCategory) / self.getCategoryCount(
  2844.             currentCategory)
  2845.  
  2846.     def weightedprob(self, currentFeature, currentCategory, prf, weight=1.0, ap=0.5):
  2847.         # Presmetaj ja osnovnata verojatnost
  2848.         basicprob = prf(currentFeature, currentCategory)
  2849.         # Izbroj kolku pati se ima pojaveno ovoj atribut (zbor)
  2850.         # vo site kategorii
  2851.         totals = sum([self.getFeatureCountsPerCategory(currentFeature, currentCategory) for currentCategory in
  2852.                       self.categories()])
  2853.         # Presmetaj ja tezinski usrednetata verojatnost
  2854.         bp = ((weight * ap) + (totals * basicprob)) / (weight + totals)
  2855.         return bp
  2856.  
  2857.  
  2858. class naivebayes(documentClassifier):
  2859.  
  2860.     def __init__(self, getfeatures):
  2861.         documentClassifier.__init__(self, getfeatures)
  2862.         self.thresholds = {}
  2863.  
  2864.     def setThreshold(self, currentCategory, threshold):
  2865.         self.thresholds[currentCategory] = threshold
  2866.  
  2867.     def getThreshold(self, currentCategory):
  2868.         if currentCategory not in self.thresholds: return 1.0
  2869.         return self.thresholds[currentCategory]
  2870.  
  2871.     # ja vrakja verojatnosta na dokumentot da e od klasata cat (cat e odnapred poznata)
  2872.     def caclulateDocumentProbabilityInClass(self, item, currentCategory):
  2873.         # zemi gi zborovite vo dokumentot item
  2874.         features = self.getfeatures(item)
  2875.         # pomnozi gi verojatnostite na site zborovi
  2876.         p = 1
  2877.         for currentFeature in features: p *= self.weightedprob(currentFeature, currentCategory,
  2878.                                                                self.getFeaturePerCategoryProbability)
  2879.  
  2880.         return p
  2881.  
  2882.     # Ja vrakja verojatnosta na klasata ako e poznat dokumentot
  2883.     def getCategoryProbabilityForDocument(self, item, currentCategory):
  2884.         catprob = self.getCategoryCount(currentCategory) / self.getTotalCount()
  2885.         caclulateDocumentProbabilityInClass = self.caclulateDocumentProbabilityInClass(item, currentCategory)
  2886.         # Bayes Theorem
  2887.         return caclulateDocumentProbabilityInClass * catprob / (1.0 / self.getTotalCount())
  2888.  
  2889.     # klasificiranje na dokument
  2890.     def classifyDocument(self, item, default=None):
  2891.         probs = {}
  2892.         # najdi ja kategorijata (klasata)
  2893.         # so najgolema verojatnost
  2894.         max = 0.0
  2895.         for cat in self.categories():
  2896.             probs[cat] = self.getCategoryProbabilityForDocument(item, cat)
  2897.             if probs[cat] > max:
  2898.                 max = probs[cat]
  2899.                 best = cat
  2900.  
  2901.         # proveri dali verojatnosta e pogolema od
  2902.         # threshold*next best (sledna najdobra)
  2903.         for cat in probs:
  2904.             if cat == best: continue
  2905.             if probs[cat] * self.getThreshold(best) > probs[best]: return default
  2906.  
  2907.         return best,round(math.log(max,2),4)
  2908.  
  2909.  
if __name__ == '__main__':

    # Driver (Python 2 syntax: print statements).  Classifies one hard-coded
    # document with two Naive Bayes classifiers that differ only in their
    # tokenizer (plain getwords vs. stop-word-aware getwords2) and prints the
    # predicted class plus the rounded log2 probability for each.
    #recenica = input()
    recenica = """Just last week, preservationists at the Old Pejeta animal sanctuary in Kenya conceded that their one male and two female northern white rhinos will not reproduce naturally. The animals were flown from the Czech zoo to the Kenyan conservancy in December 2009 in hopes that the natural environment could be easier for them to breed there than in captivity."""


    cl = naivebayes(getwords)

    for i in train_data:
        cl.train(i[0],i[1])

    # classifyDocument returns a (category, log2 probability) pair here
    class1, verojatnost1 = cl.classifyDocument(recenica)


    cl1 = naivebayes(getwords2)
    for i in train_data:
        cl1.train(i[0],i[1])

    class2, verojatnost2 = cl1.classifyDocument(recenica)


    print class1, verojatnost1
    print class2, verojatnost2
    # ratio of the two log2 probabilities
    print "%.4f" % round((verojatnost2 / verojatnost1), 4)

    if class1 != class2:
        # the two tokenizers disagree on the predicted class
        print("kontradikcija")
  2937. -----------------------------------------------------------------------------------------------------
""" Klasifikacija - Twitter
  2939.  
  2940. Потребно е да се направи систем кој ќе знае да класифицира твитови во однос на тонот (sentiment) на позитивен и негативен.
  2941. Дадена ви е листа train_data од торки. Прв елемент во торката е класата (positive/negative) и втор елемент е содржината на твитот.
  2942. Користејќи ги првите 200 примери, да се изгради наивен Баесов класификатор кој ќе научи да класифицира непознати твитови.
  2943.  
  2944. Потоа, за прочитан индекс од влезот (број од 200 до 999) да се најде твитот на соодветната позиција во train_data и истиот да се класифицира.
  2945. Во првата линија се печати бројот на позитивни и негативни примери во тренинг множеството,
  2946. а во втората линија се печати индексот на тест примерот (прочитано од влез), точната класа, предвидената класа и содржината на твитот.
  2947.  
  2948. """
  2949.  
  2950. def getwords(doc):
  2951.     # regularen izraz koj ke go deli stringot na zborovi
  2952.     # stringot se deli na zborovi na site prazni mesta i interpunkciski znaci
  2953.     splitter = re.compile('\\W*')
  2954.     # podeli go dokumentot na zborovi
  2955.     # i konvertiraj gi vo mali bukvi
  2956.     # pa potoa stavi gi vo lista
  2957.     # ako nivnata dolzina e >2 i <20
  2958.     words = set()
  2959.     for word in splitter.split(doc):
  2960.         if 2 < len(word) < 20:
  2961.             words.add(word.lower())
  2962.     return words
  2963.     # words = [word.lower() for word in splitter.split(doc) if len(word) > 2 and len(word) < 20]
  2964.     # # se vrakja recnik cii klucevi se zborovite koi
  2965.     # # se vo dokumentot, a eden zbor duri i da se
  2966.     # # srekjava poveke pati vo recnikot ke bide samo ednas
  2967.     # # vrednosta (1) vo recnikot ne e potrebna
  2968.     # return dict([(word, 1) for word in words])
  2969.  
  2970.  
  2971. # {'python': {'bad': 0, 'good': 6}, 'the': {'bad': 3, 'good': 3}}
  2972.  
  2973.  
  2974. # print(w)
  2975. # exit(1)
  2976.  
  2977. class documentClassifier:
  2978.     def __init__(self, getfeatures, filename=None):
  2979.         # Broj na parovi atribut/kategorija (feature/category)
  2980.         self.featureCountsPerCategory = {}
  2981.         # Broj na dokumenti vo sekoja kategorija
  2982.         self.categoryCounts = {}
  2983.         # funkcija za dobivanje na atributite (zborovite) vo dokumentot
  2984.         self.getfeatures = getfeatures
  2985.  
  2986.     # Zgolemuvanje na brojot na parovi atribut/kategorija
  2987.     def incrementFeatureCountsPerCategory(self, currentFeature, currentCategory):
  2988.         self.featureCountsPerCategory.setdefault(currentFeature, {})
  2989.         self.featureCountsPerCategory[currentFeature].setdefault(currentCategory, 0)
  2990.         self.featureCountsPerCategory[currentFeature][currentCategory] += 1
  2991.  
  2992.     # Zgolemuvanje na brojot na predmeti(dokumenti) vo kategorija
  2993.     def incrementCategoryCounts(self, cat):
  2994.         self.categoryCounts.setdefault(cat, 0)
  2995.         self.categoryCounts[cat] += 1
  2996.  
  2997.     # Dobivanje na brojot kolku pati
  2998.     # odreden atribut se ima pojaveno vo odredena kategorija
  2999.     def getFeatureCountsPerCategory(self, currentFeature, currentCategory):
  3000.         if currentFeature in self.featureCountsPerCategory and currentCategory in self.featureCountsPerCategory[
  3001.             currentFeature]:
  3002.             return float(self.featureCountsPerCategory[currentFeature][currentCategory])
  3003.         return 0.0
  3004.  
  3005.     # Dobivanje na brojot na predmeti(dokumenti) vo kategorija
  3006.     def getCategoryCount(self, currentCategory):
  3007.         if currentCategory in self.categoryCounts:
  3008.             return float(self.categoryCounts[currentCategory])
  3009.         return 0
  3010.  
  3011.     # Dobivanje na vkupniot broj na predmeti
  3012.     def getTotalCount(self):
  3013.         return sum(self.categoryCounts.values())
  3014.  
  3015.     # Dobivanje na lista na site kategorii
  3016.     def categories(self):
  3017.         return self.categoryCounts.keys()
  3018.  
  3019.     # Treniranje na klasifikatorot
  3020.     # Noviot predmet (dokument) item pripagja na kategorijata cat
  3021.     def train(self, item, currentCategory):
  3022.         # Se zemaat atributite (zborovite) vo predmetot(dokumentot)
  3023.         features = self.getfeatures(item)
  3024.         # Se zgolemuva brojot na sekoj atribut vo ovaa kategorija
  3025.         for currentFeature in features:
  3026.             self.incrementFeatureCountsPerCategory(currentFeature, currentCategory)
  3027.  
  3028.         # Se zgolemuva brojot na predmeti (dokumenti) vo ovaa kategorija
  3029.         self.incrementCategoryCounts(currentCategory)
  3030.  
  3031.     def getFeaturePerCategoryProbability(self, currentFeature, currentCategory):
  3032.         if self.getCategoryCount(currentCategory) == 0: return 0
  3033.         # Verojatnosta e vkupniot broj na pati koga
  3034.         # ovoj atribut f (zbor) se pojavil vo ovaa
  3035.         # kategorija podeleno so vkupniot broj na
  3036.         # predmeti (dokumenti) vo ovaa kategorija
  3037.         return self.getFeatureCountsPerCategory(currentFeature, currentCategory) / self.getCategoryCount(
  3038.             currentCategory)
  3039.  
  3040.     def weightedprob(self, currentFeature, currentCategory, prf, weight=1.0, ap=0.5):
  3041.         # Presmetaj ja osnovnata verojatnost
  3042.         basicprob = prf(currentFeature, currentCategory)
  3043.         # Izbroj kolku pati se ima pojaveno ovoj atribut (zbor)
  3044.         # vo site kategorii
  3045.         totals = sum([self.getFeatureCountsPerCategory(currentFeature, currentCategory) for currentCategory in
  3046.                       self.categories()])
  3047.         # Presmetaj ja tezinski usrednetata verojatnost
  3048.         bp = ((weight * ap) + (totals * basicprob)) / (weight + totals)
  3049.         return bp
  3050.  
  3051.  
  3052. #dc = documentClassifier(getwords)
  3053. #dc.train("sistemi na znaenje e dosaden predmet", "tracevi")
  3054. #dc.train("asistentot po sistemi na znaenje e isto taka dosaden", "tracevi")
  3055. #dc.train("vezbite po sistemi na znaenje moze da se podobrat na sledniov nacin...", "kritiki")
  3056. #dc.train("predvanjata po sistemi na znaenje ne moze da se podobrat bidejki se najdobri...", "kritiki")
  3057.  
  3058.  
  3059. class naivebayes(documentClassifier):
  3060.     def __init__(self, getfeatures):
  3061.         documentClassifier.__init__(self, getfeatures)
  3062.         self.thresholds = {}
  3063.  
  3064.     def setThreshold(self, currentCategory, threshold):
  3065.         self.thresholds[currentCategory] = threshold
  3066.  
  3067.     def getThreshold(self, currentCategory):
  3068.         if currentCategory not in self.thresholds: return 1.0
  3069.         return self.thresholds[currentCategory]
  3070.  
  3071.     # ja vrakja verojatnosta na dokumentot da e od klasata cat (cat e odnapred poznata)
  3072.     def caclulateDocumentProbabilityInClass(self, item, currentCategory):
  3073.         # zemi gi zborovite vo dokumentot item
  3074.         features = self.getfeatures(item)
  3075.         # pomnozi gi verojatnostite na site zborovi
  3076.         p = 1
  3077.         for currentFeature in features:
  3078.             p *= self.weightedprob(currentFeature, currentCategory,
  3079.                                    self.getFeaturePerCategoryProbability)
  3080.  
  3081.         return p
  3082.  
  3083.     # Ja vrakja verojatnosta na klasata ako e poznat dokumentot
  3084.     def getCategoryProbabilityForDocument(self, item, currentCategory):
  3085.         catprob = self.getCategoryCount(currentCategory) / self.getTotalCount()
  3086.         caclulateDocumentProbabilityInClass = self.caclulateDocumentProbabilityInClass(item, currentCategory)
  3087.         # Bayes Theorem
  3088.         return caclulateDocumentProbabilityInClass * catprob / (1.0 / self.getTotalCount())
  3089.  
  3090.     # klasificiranje na dokument
  3091.     def classifyDocument(self, item, default=None):
  3092.         probs = {}
  3093.         # najdi ja kategorijata (klasata)
  3094.         # so najgolema verojatnost
  3095.         max = 0.0
  3096.         for cat in self.categories():
  3097.             probs[cat] = self.getCategoryProbabilityForDocument(item, cat)
  3098.             if probs[cat] > max:
  3099.                 max = probs[cat]
  3100.                 best = cat
  3101.  
  3102.         # proveri dali verojatnosta e pogolema od
  3103.         # threshold*next best (sledna najdobra)
  3104.         for cat in probs:
  3105.             if cat == best: continue
  3106.             if probs[cat] * self.getThreshold(best) > probs[best]: return default
  3107.  
  3108.         return best
  3109.  
  3110.  
  3111.  
  3112. if __name__ == '__main__':
  3113.  
  3114.     cl = naivebayes(getwords)
  3115.  
  3116.     #index = input()
  3117.     index=250
  3118.  
  3119.     pozitive = 0
  3120.     negative = 0
  3121.  
  3122.     for i in range(200):
  3123.         #print train_data[i][1],train_data[i][0], i
  3124.         if train_data[i][0] == 'negative':
  3125.             negative+=1
  3126.         else:
  3127.             pozitive+=1
  3128.  
  3129.         cl.train(train_data[i][1], train_data[i][0])
  3130.         #break
  3131.  
  3132.     twit = train_data[index]
  3133.  
  3134.     klasifikacija = cl.classifyDocument(twit[1])
  3135.  
  3136.     print "Pozitivni: {}, Negativni: {}".format(pozitive, negative)
  3137.     print "Index: {}, Tocna Klasa: {}, Predvidena Klasa: {}, Twit: {}".format(index, tweet[0], klasifikacija, twit[1])
  3138.  
  3139. ------------------------------------------------------------------------------------------------------
  3140. #Пресметка на статистики и еден подвижен прозорец Problem 1 (2 / 2)
  3141. За даденото податочно множество во листата X_data со предефинирана должина N од секоја
  3142. колона треба да се пресметаат следниве статистики: минимум, максимум, средна вредност,
стандардна девијација. Притоа од стандарден влез се чита поместувањето D и должината на
  3144. подвижниот прозорец L. На излез треба да се испечати испроцесираното множество, така што
  3145. во секоја линија ќе се испечати бројот на редицата со која завршува подвижниот прозорец
  3146. и листа од вредности заокружени на 2 децимали со бараните статистики од секоја колона
  3147. (прво 4 статистики за првата колона, па 4 статистики за втората колона итн...)
  3148. Вкупниот број на редови е floor((N-L)/D + 1).
  3149.  
  3150. НАПОМЕНА: Заради ефикасност на решението треба да ги пресметувате само статистиките кои се бараат.
  3151.  
  3152.  
  3153. from __future__ import print_function
  3154. import numpy as np
  3155. import scipy.stats as sp
  3156.  
  3157. X_data = [[119, 57, 51, 3, 1, 141],
  3158.           [105, 54, 62, 3, 1, 133],
  3159. .........................]]
  3160.          
  3161. .........
  3162.  
  3163. def percentiles_all(x, iqr=True, amplitude=True, percentiles_list=[5, 10, 25, 40, 50, 60, 75, 90, 95]):
  3164.     names = ['p_' + str(p) for p in percentiles_list]
  3165.     if iqr and 25 in percentiles_list and 75 in percentiles_list:
  3166.         names.append('iqr')
  3167.     if amplitude and 1 in percentiles_list and 99 in percentiles_list:
  3168.         names.append('perc_amp')
  3169.     if len(x) == 0:
  3170.         values = [0 for i in range(len(names))]
  3171.         return values, names
  3172.     if len(percentiles_list) > 0 and all([0 < q < 100 for q in percentiles_list]):
  3173.         values = list(np.percentile(x, percentiles_list))
  3174.     else:
  3175.         values = []
  3176.     if iqr:
  3177.         q1 = percentiles_list.index(25)
  3178.         q3 = percentiles_list.index(75)
  3179.         values.append(values[q3] - values[q1])
  3180.     if amplitude and 1 in percentiles_list and 99 in percentiles_list:
  3181.         q1 = percentiles_list.index(1)
  3182.         q3 = percentiles_list.index(99)
  3183.         values.append(values[q3] - values[q1])
  3184.         return values, names
  3185.  
  3186.  
  3187. def stats_calculate_all(X_data):
  3188.     stats_all_names = stats_all_names = ['len', 'min', 'max', 'range', 'mean', 'hmean', 'gmean', 'var', 'std', 'skew',
  3189.                                          'kurtosis', 'median', 'mode', 'energy', 'energy_sample', 'snr']
  3190.  
  3191.     xnp = np.array(X_data)
  3192.     n = len(X_data)
  3193.     if n == 0:
  3194.         values = [0 for i in range(len(stats_all_names))]
  3195.         return values, stats_all_names
  3196.     values = []
  3197.     vmin = float(min(xnp))
  3198.     if vmin < 1:
  3199.         offset = 1 + abs(vmin)
  3200.     else:
  3201.         offset = 0
  3202.     vmax = float(max(xnp))
  3203.     #vrange = vmax - vmin
  3204.     vmean = float(np.mean(xnp))
  3205.     vstd = float(np.std(xnp))
  3206.     #venergy = float(sum(np.array(xnp) ** 2))
  3207.     #venergy_sample = venergy / n
  3208.     #snr = 0.0
  3209.     #if vstd != 0: snr = vmean / vstd
  3210.     values.append(vmin)
  3211.     values.append(vmax)
  3212.     #values.append(vrange)
  3213.     values.append(vmean)
  3214.     # values.append(float(sp.hmean(xnp + offset)))
  3215.     # values.append(float(sp.gmean(xnp + offset)))
  3216.     # values.append(vstd ** 2)
  3217.     values.append(vstd)
  3218.     # values.append(sp.skew(xnp))
  3219.     # values.append(sp.kurtosis(xnp))
  3220.     #values.append(np.median(xnp))
  3221.     # vmode = sp.mode(xnp)
  3222.     # vmode = float(vmode[0][0])
  3223.     # values.append(vmode)
  3224.     # values.append(venergy)
  3225.     # values.append(venergy_sample)
  3226.     #values.append(snr)
  3227.     return values, stats_all_names
  3228.  
  3229.  
  3230. if __name__ == "__main__":
  3231.      x = np.array(X_data)
  3232.  
  3233.     # vasiot kod tuka
  3234.      shift=input()
  3235.      w_long=input()
  3236.      #print(x.shape)
  3237.      #print(x[:10, 0].shape)
  3238.      #print(x[:10, 0])
  3239.      #print(sorted(x[:10, 0]))
  3240.      #print(stats_calculate_all(x[:10, 0]))
  3241.      #print(percentiles_all(x[:10, 0], iqr=False, amplitude=False, percentiles_list=[10, 20, 50, 70, 90]))
  3242.      #shift = 50
  3243.      #w_long=100
  3244.      #w_short = 20
  3245.      for i in range(w_long, len(X_data), shift):
  3246.         row=[]
  3247.         for j in range(x.shape[1]):
  3248.             #x.shape[1]
  3249.             x_winow_long = x[i - w_long:i,j]
  3250.             #x_winow_short = x[i - w_short:i, j]
  3251.             #p1 = percentiles_all(x_winow_long)
  3252.             #p2 = percentiles_all(x_winow_short)
  3253.             s1, stat_names = stats_calculate_all(x_winow_long)
  3254.             #s2, _ = stats_calculate_all(x_winow_short)
  3255.             #s2=round((s1),2)
  3256.             #long_MVA = s1[4]
  3257.             row+=s1
  3258.             #print(s1,row)
  3259.             #short_MVA = s2[4]
  3260.         row3=[round(r,2) for r in row]
  3261.         print(i,row3)
  3262.  
  3263. ---------------------------------------------------------------------------------------------------
  3264. #Пресметка на статистики и два подвижни прозорци Problem 2 (1 / 2)
За даденото податочно множество во листата X_data со предефинирана должина N од секоја колона
треба да се пресмета средната вредност. Притоа од стандарден влез се чита должината на долгиот
подвижен прозорец L1, и краткиот подвижен прозорец L2. Поместувањето е фиксно и е 1 ред.
  3268. На излез треба да се испечати испроцесираното множество, така што во секоја линија ќе се
  3269. испечати бројот на редицата со која завршува подвижниот прозорец и листа од вредности за
  3270. секоја колона заокружени на 2 децимали: тековната вредност, средната вредност во долгиот
  3271. подвижен прозорец, средна вредност во краткиот подвижен прозорец, и разлика од двете средни вредности.
  3272. За M колони во секој ред треба да се испечатат листа со M x 4 елементи
  3273. (тековна вредност, средна вредност долг подвижен прозорец, средна вредност краток
  3274. подвижен прозорец, разлика од средните вредности). Заокружувањето се прави при печатењето на
  3275. вредностите, но тие се чуваат без заокружување.
  3276.  
  3277. НАПОМЕНА: Заради ефикасност на решението треба да ги пресметувате само статистиките кои се бараат.
  3278.  
  3279. def stats_calculate_all(x):
  3280.     """
  3281.    Calculates stats from the provided list xnp, based on the stats config object.
  3282.    :param stat_config: Configuration for which stats to be computed.
  3283.    :param x: The time series list.
  3284.    :param offset: Offset of the values so some stats can be calculated
  3285.    :return:
  3286.    """
  3287.     stats_all_names = ['len', 'min', 'max', 'range', 'mean', 'hmean', 'gmean', 'var', 'std', 'skew', 'kurtosis',
  3288.                        'median', 'mode', 'energy', 'energy_sample', 'snr']
  3289.  
  3290.     xnp = np.array(x)
  3291.     n = len(x)
  3292.     if n == 0:
  3293.         values = [0 for i in range(len(stats_all_names))]
  3294.         return values, stats_all_names
  3295.     values = [n]
  3296.     vmin = float(min(xnp))
  3297.     if vmin < 1:
  3298.         offset = 1 + abs(vmin)
  3299.     else:
  3300.         offset = 0
  3301.     vmax = float(max(xnp))
  3302.     vrange = vmax - vmin
  3303.     vmean = float(np.mean(xnp))
  3304.     vstd = float(np.std(xnp))
  3305.    
  3306.     values.append(vmin)
  3307.     values.append(vmax)
  3308.     values.append(vrange)
  3309.     values.append(vmean)
  3310.     values.append(vstd)
  3311.    
  3312.     return values, stats_all_names
  3313.  
  3314. if __name__ == "__main__":
  3315.     x = np.array(X_data)
  3316.     w_long = input()
  3317.     w_short = input()
  3318.     shift = 1
  3319.     arr_len = len(x[0, :])
  3320.    
  3321.     for i in range(max(w_short, w_long), len(X_data), shift):
  3322.         result = []
  3323.         for j in range(0, arr_len):
  3324.            
  3325.             x_winow_long = x[i - w_long:i, j]
  3326.             x_winow_short = x[i - w_short:i, j]
  3327.             s1, stat_names = stats_calculate_all(x_winow_long)
  3328.             s2, _ = stats_calculate_all(x_winow_short)
  3329.             long_MVA = s1[4]  # Moving Average long window
  3330.             short_MVA = s2[4]  # Moving Average short window
  3331.             # broj na primerok, broj na kolona, vrednost na primerokot i kolonata
  3332.             # razlika od tekovna vrednost i prosek od dolg prozorec
  3333.             # razlika od tekovna vrednost i prosek od kratok prozorec
  3334.             # razlika od pomegu prosek vo dolg i kratok prozorec
  3335.             result.extend([round(x[i, j], 2), round(long_MVA, 2), round(short_MVA, 2), round(long_MVA - short_MVA, 2)])
  3336.         print(i,result)
  3337. -------------------------------------------------------------------------------------------------------
  3338. # Vremenski prozorci - ispit januari 2017 - courses
  3339.  
  3340. За даденото податочно множество во листата X_data со предефинирана должина N од секоја колона треба да се пресмета средната вредност, медијаната и стандардната девијација.
Притоа од стандарден влез се чита должината на долгиот подвижен прозорец L1, и краткиот подвижен прозорец L2.
  3342. Поместувањето е фиксно и е 5 реда. На излез треба да се испечати процесираното множество, така што во секоја линија
  3343. ќе се испечати бројот на редицата со која завршува подвижниот прозорец и листа од вредности за секоја колона заокружени
  3344. на 2 децимали: тековната вредност, средната вредност, медијаната и стандардната девијација во долгиот подвижен прозорец,
  3345. средната вредност, медијаната и стандардната девијација во краткиот подвижен прозорец. Заокружувањето се прави при печатењето на вредностите,
  3346. но тие се чуваат без заокружување. Дополнително, доколку средната вредност од долгиот е поголема од средната вредност од краткиот прозорец,
  3347. за секоја колона да се додаде вредност 1, а доколку средната вредност од краткиот е поголема од средната вредност од долгиот, да се додаде вредност -1.
  3348.  
  3349.  
# Binary class labels for the rows of X_data (a run of 1s followed by 0s);
# not referenced by the visible code below — presumably used for
# classification elsewhere in the full file (TODO confirm).
y_labels = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
            1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
            1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
            1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
            1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
            1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
            1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
            1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
            1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
            1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
            1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
            1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
  3369.  
  3370.  
  3371. def percentiles_all(x, iqr=True, amplitude=True, percentiles_list=[5, 10, 25, 40, 50, 60, 75, 90, 95]):
  3372.     names = ['p_' + str(p) for p in percentiles_list]
  3373.     if iqr and 25 in percentiles_list and 75 in percentiles_list:
  3374.         names.append('iqr')
  3375.     if amplitude and 1 in percentiles_list and 99 in percentiles_list:
  3376.         names.append('perc_amp')
  3377.     if len(x) == 0:
  3378.         values = [0 for i in range(len(names))]
  3379.         return values, names
  3380.     if len(percentiles_list) > 0 and all([0 < q < 100 for q in percentiles_list]):
  3381.         values = list(np.percentile(x, percentiles_list))
  3382.     else:
  3383.         values = []
  3384.     if iqr:
  3385.         q1 = percentiles_list.index(25)
  3386.         q3 = percentiles_list.index(75)
  3387.         values.append(values[q3] - values[q1])
  3388.     if amplitude and 1 in percentiles_list and 99 in percentiles_list:
  3389.         q1 = percentiles_list.index(1)
  3390.         q3 = percentiles_list.index(99)
  3391.         values.append(values[q3] - values[q1])
  3392.     return values, names
  3393.  
  3394. def stats_calculate_all(x):
  3395.     xnp = np.array(x)
  3396.     n = len(x)
  3397.     if n == 0:
  3398.         values = [0 for i in range(len(stats_all_names))]
  3399.         return values, stats_all_names
  3400.     values = []
  3401.     vmean = float(np.mean(xnp))
  3402.     vstd = float(np.std(xnp))
  3403.     if vstd != 0:
  3404.         snr = vmean / vstd
  3405.     values.append(vmean)
  3406.     values.append(np.median(xnp))
  3407.     values.append(vstd)
  3408.     return values
  3409.  
  3410.  
  3411. if __name__ == "__main__":
  3412.     x = np.array(X_data)
  3413.     shift = 5  # pomestuvanje pomegu podviznite prozorci
  3414.     w_long = input()  # dolzina (broj na otcituvanja) na dolgiot prozorec
  3415.     w_short = input()  # dolzina (broj na otcituvanja) na kratkiot prozorec
  3416.     for i in range(max(w_short, w_long), len(X_data), shift):
  3417.         lista=[]
  3418.         nova=[]
  3419.         for j in range(x.shape[1]):
  3420.             x_winow_long = x[i - w_long:i, j]
  3421.             x_winow_short = x[i - w_short:i, j]
  3422.             s1 = stats_calculate_all(x_winow_long)
  3423.             s2= stats_calculate_all(x_winow_short)
  3424.             lista.append(round((x[i][j]),2))
  3425.             for d in s1:
  3426.                 lista.append(round(d,2))
  3427.             for d in s2:
  3428.                 lista.append(round(d,2))
  3429.             if(s1[0]<s2[0]):
  3430.                 lista.append(-1)
  3431.             else:
  3432.                 lista.append(1)
  3433.         print (i,lista)
  3434.  
  3435. ------------------------------------------------------------------------------------------------------
  3436. ------------------------------------------------------------------------------------------------------
  3437. Sistemi za preporaka - 1
  3438.  
  3439. Оцени на корисници и филмови Problem 1 (1 / 2)
  3440. За даденото множество кое е претставено како речник чиј клуч е името на
  3441. корисникот и вредност е речник чиј клуч е филмот, а вредност е оцената
  3442. која корисникот ја дал за филмот, да се инвертира така што ќе добиете
  3443. повторно речник од речници. Во новиот речник клуч е името на филмот,
  3444. а вредност е речник чиј клуч е името на корисникот, а вредност е оцената
  3445. која тој корисник ја дал за тековниот филм.
  3446.  
Потоа за прочитано име на филм од стандарден влез да се испечати речникот со оцените
кои корисниците ги дале за него (како во примерот подолу).
  3449.  
  3450. Sample input
  3451. 'Catch Me If You Can'
  3452. Sample output
  3453. {'Lisa Rose': 3.0, 'Jack Matthews': 4.5, 'Michael Phillips': 2.5, 'Gary Coleman': 1.5, 'Michelle Nichols': 2.5}
  3454.  
  3455.  
# Ratings table: user name -> {movie title: rating given by that user}.
oceniPoKorisnici={
    'Lisa Rose': {'Catch Me If You Can': 3.0 , 'Snakes on a Plane': 3.5, 'Superman Returns': 3.5, 'You, Me and Dupree': 2.5, 'The Night Listener': 3.0, 'Snitch': 3.0},
    'Gene Seymour': {'Lady in the Water': 3.0, 'Snakes on a Plane': 3.5, 'Just My Luck': 1.5,  'The Night Listener': 3.0,'You, Me and Dupree': 3.5},
    'Michael Phillips': {'Catch Me If You Can': 2.5, 'Lady in the Water': 2.5,'Superman Returns': 3.5, 'The Night Listener': 4.0, 'Snitch': 2.0},
    'Claudia Puig': {'Snakes on a Plane': 3.5, 'Just My Luck': 3.0,'The Night Listener': 4.5, 'Superman Returns': 4.0,'You, Me and Dupree': 2.5},
    'Mick LaSalle': {'Lady in the Water': 3.0, 'Snakes on a Plane': 4.0,'Just My Luck': 2.0, 'Superman Returns': 3.0, 'You, Me and Dupree': 2.0},
    'Jack Matthews': {'Catch Me If You Can': 4.5, 'Lady in the Water': 3.0, 'Snakes on a Plane': 4.0,'The Night Listener': 3.0, 'Superman Returns': 5.0, 'You, Me and Dupree': 3.5, 'Snitch': 4.5},
    'Toby': {'Snakes on a Plane':4.5, 'Snitch': 5.0},
    'Michelle Nichols': {'Just My Luck' : 1.0, 'The Night Listener': 4.5, 'You, Me and Dupree': 3.5, 'Catch Me If You Can': 2.5, 'Snakes on a Plane': 3.0},
    'Gary Coleman': {'Lady in the Water': 1.0, 'Catch Me If You Can': 1.5, 'Superman Returns': 1.5, 'You, Me and Dupree': 2.0},
    'Larry': {'Lady in the Water': 3.0, 'Just My Luck': 3.5, 'Snitch': 1.5, 'The Night Listener': 3.5}
    }
  3468.  
  3469. def invertirajOceni(oceni):
  3470.     oceniPoFilmovi={}
  3471.     for person in oceni:
  3472.         for item in oceni[person]:
  3473.             oceniPoFilmovi.setdefault(item,{})
  3474.             oceniPoFilmovi[item][person]=oceni[person][item]
  3475.            
  3476.            
  3477.     return oceniPoFilmovi
  3478.  
  3479. if __name__ == "__main__":
  3480.     oceniPoFilmovi=invertirajOceni(oceniPoKorisnici)
  3481.  
  3482.     film=input()
  3483.  
  3484.     print oceniPoFilmovi[film]
  3485. -----------------------------------------------------------------------------------------------------
  3486. Sistemi za preporaka - 2
  3487.  
  3488. Opened: 196 дена
  3489. Функции за сличност Problem 2 (1 / 2)
  3490. Да се напишат функции за пресметување на сличност базирани на Пеарсонова корелација и
  3491. Евклидово растојание кои ќе враќаат торка од сличноста и бројот на заеднички елементи.
  3492. За прочитани имиња на двајца корисници да се испечатата торките што ги враќаат двете функции.
  3493. Sample input
  3494. 'Jack Matthews'
  3495. 'Gene Seymour'
  3496. Sample output
  3497. (0.905, 4)
  3498. (0.667, 4)
  3499.  
  3500. import math
  3501.  
# Ratings table: user name -> {movie title: rating given by that user}.
oceniPoKorisnici={
    'Lisa Rose': {'Catch Me If You Can': 3.0 , 'Snakes on a Plane': 3.5, 'Superman Returns': 3.5, 'You, Me and Dupree': 2.5, 'The Night Listener': 3.0, 'Snitch': 3.0},
    'Gene Seymour': {'Lady in the Water': 3.0, 'Snakes on a Plane': 3.5, 'Just My Luck': 1.5,  'The Night Listener': 3.0,'You, Me and Dupree': 3.5},
    'Michael Phillips': {'Catch Me If You Can': 2.5, 'Lady in the Water': 2.5,'Superman Returns': 3.5, 'The Night Listener': 4.0, 'Snitch': 2.0},
    'Claudia Puig': {'Snakes on a Plane': 3.5, 'Just My Luck': 3.0,'The Night Listener': 4.5, 'Superman Returns': 4.0,'You, Me and Dupree': 2.5},
    'Mick LaSalle': {'Lady in the Water': 3.0, 'Snakes on a Plane': 4.0,'Just My Luck': 2.0, 'Superman Returns': 3.0, 'You, Me and Dupree': 2.0},
    'Jack Matthews': {'Catch Me If You Can': 4.5, 'Lady in the Water': 3.0, 'Snakes on a Plane': 4.0,'The Night Listener': 3.0, 'Superman Returns': 5.0, 'You, Me and Dupree': 3.5, 'Snitch': 4.5},
    'Toby': {'Snakes on a Plane':4.5, 'Snitch': 5.0},
    'Michelle Nichols': {'Just My Luck' : 1.0, 'The Night Listener': 4.5, 'You, Me and Dupree': 3.5, 'Catch Me If You Can': 2.5, 'Snakes on a Plane': 3.0},
    'Gary Coleman': {'Lady in the Water': 1.0, 'Catch Me If You Can': 1.5, 'Superman Returns': 1.5, 'You, Me and Dupree': 2.0},
    'Larry': {'Lady in the Water': 3.0, 'Just My Luck': 3.5, 'Snitch': 1.5, 'The Night Listener': 3.5}
    }
  3514.  
  3515. def sim_distance(oceni,person1,person2):
  3516.     # Se pravi lista na zaednicki predmeti (filmovi)
  3517.     zaednicki={}
  3518.     for item in oceni[person1].keys():
  3519.         if item in oceni[person2]:
  3520.             zaednicki[item]=1
  3521.     # ako nemaat zaednicki rejtinzi, vrati 0
  3522.     if len(zaednicki)==0: return 0
  3523.     # Soberi gi kvadratite na zaednickite razliki
  3524.     sum_of_squares=sum([pow(oceni[person1][item]-oceni[person2][item],2)
  3525.         for item in oceni[person1] if item in oceni[person2]])
  3526.     return (round(1/(1+math.sqrt(sum_of_squares)),3),len(zaednicki))
  3527.  
  3528. def sim_pearson(oceni,p1,p2):
  3529.     #for math import sqrt
  3530.     # Se kreira recnik vo koj ke se cuvaat predmetite (filmovi) koi se oceneti od dvajcata
  3531.     # Vo recnikot ni se vazni samo klucevite za da gi cuvame iminjata na filmovite koi se zaednicki, a vrednostite ne ni se vazni
  3532.     zaednicki={}
  3533.     for item in oceni[p1]:
  3534.         if item in oceni[p2]: zaednicki[item]=1
  3535.  
  3536.     # Se presmetuva brojot na predmeti oceneti od dvajcata
  3537.     n=len(zaednicki)
  3538.  
  3539.     # Ako nemaat zaednicki predmeti vrati korelacija 0
  3540.     if n==0: return 0
  3541.  
  3542.     # Soberi gi zaednickite oceni (rejtinzi) za  sekoja licnost posebno
  3543.     sum1=sum([oceni[p1][it] for it in zaednicki])
  3544.     sum2=sum([oceni[p2][it] for it in zaednicki])
  3545.  
  3546.     # Soberi gi kvadratite od zaednickite oceni (rejtinzi) za  sekoja licnost posebno
  3547.     sum1Sq=sum([pow(oceni[p1][it],2) for it in zaednicki])
  3548.     sum2Sq=sum([pow(oceni[p2][it],2) for it in zaednicki])
  3549.  
  3550.     # Soberi gi proizvodite od ocenite na dvete licnosti
  3551.     pSum=sum([oceni[p1][it]*oceni[p2][it] for it in zaednicki])
  3552.  
  3553.     # Presmetaj go koeficientot na korelacija
  3554.     num=pSum-(sum1*sum2/n)
  3555.     den=math.sqrt((sum1Sq-pow(sum1,2)/n)*(sum2Sq-pow(sum2,2)/n))
  3556.     if den==0: return 0
  3557.     r=num/den
  3558.     return (round(r,3),n)
  3559.  
  3560. if __name__ == "__main__":
  3561.  
  3562.     korisnik1=input()
  3563.     korisnik2=input()
  3564.     # korisnik1='Mick LaSalle'
  3565.     # korisnik2='Lisa Rose'
  3566.     print sim_pearson(oceniPoKorisnici, korisnik1, korisnik2)
  3567.     print sim_distance(oceniPoKorisnici, korisnik1, korisnik2)
  3568.  
  3569. ------------------------------------------------------------------------------------------------------
  3570. Sistemi za preporaka - 3
  3571.  
  3572. Табела на слични корисници Problem 3 (1 / 2)
  3573. Да се напише функција која ќе генерира табела на слични корисници претставена како речник од речници
  3574. (клучеви се имињата на корисниците), така што за секој пар корисници ќе чува торка од сличност
  3575. базирана на Пеарсонова корелација, сличност базирана на Евклидово растојание, и број на заеднички
  3576. оцени (оцени дадени за исти филмови). Вредностите да бидат заокружени на 3 децимали. За прочитани
  3577. имиња на двајца корисници да се испечати торката која се чува во генерираната табела.
  3578. Sample input
  3579. 'Larry'
  3580. 'Gene Seymour'
  3581. Sample output
  3582. (0.327, -0.5, 3)
  3583.  
  3584.  
  3585. from math import sqrt
# Training data: per-user movie ratings, user name -> {movie title: rating}.
# Ratings appear to be on a 1.0-5.0 scale; not every user rated every movie.
oceniPoKorisnici={
    'Lisa Rose': {'Catch Me If You Can': 3.0 , 'Snakes on a Plane': 3.5, 'Superman Returns': 3.5, 'You, Me and Dupree': 2.5, 'The Night Listener': 3.0, 'Snitch': 3.0},
    'Gene Seymour': {'Lady in the Water': 3.0, 'Snakes on a Plane': 3.5, 'Just My Luck': 1.5,  'The Night Listener': 3.0,'You, Me and Dupree': 3.5},
    'Michael Phillips': {'Catch Me If You Can': 2.5, 'Lady in the Water': 2.5,'Superman Returns': 3.5, 'The Night Listener': 4.0, 'Snitch': 2.0},
    'Claudia Puig': {'Snakes on a Plane': 3.5, 'Just My Luck': 3.0,'The Night Listener': 4.5, 'Superman Returns': 4.0,'You, Me and Dupree': 2.5},
    'Mick LaSalle': {'Lady in the Water': 3.0, 'Snakes on a Plane': 4.0,'Just My Luck': 2.0, 'Superman Returns': 3.0, 'You, Me and Dupree': 2.0},
    'Jack Matthews': {'Catch Me If You Can': 4.5, 'Lady in the Water': 3.0, 'Snakes on a Plane': 4.0,'The Night Listener': 3.0, 'Superman Returns': 5.0, 'You, Me and Dupree': 3.5, 'Snitch': 4.5},
    'Toby': {'Snakes on a Plane':4.5, 'Snitch': 5.0},
    'Michelle Nichols': {'Just My Luck' : 1.0, 'The Night Listener': 4.5, 'You, Me and Dupree': 3.5, 'Catch Me If You Can': 2.5, 'Snakes on a Plane': 3.0},
    'Gary Coleman': {'Lady in the Water': 1.0, 'Catch Me If You Can': 1.5, 'Superman Returns': 1.5, 'You, Me and Dupree': 2.0},
    'Larry': {'Lady in the Water': 3.0, 'Just My Luck': 3.5, 'Snitch': 1.5, 'The Night Listener': 3.5}
    }
  3598. # Vrakja merka za slicnost bazirana na rastojanieto za person1 i person2
  3599. def sim_distance(oceni,person1,person2):
  3600.     si={}
  3601.     for item in oceni[person1]:
  3602.         if item in oceni[person2]:
  3603.             si[item]=1
  3604.     if len(si)==0:
  3605.         return 0
  3606.     sum_of_squares=sum([pow(oceni[person1][item]-oceni[person2][item],2)
  3607.                         for item in oceni[person1] if item in oceni[person2]])
  3608.     return (round(1.0/(1+sqrt(sum_of_squares)),3),len(si))
  3609.     return (0,0)
  3610. def sim_pearson(oceni,person1,person2):
  3611.     si={}
  3612.     for item in oceni[person1]:
  3613.         if item in oceni[person2]:
  3614.             si[item]=1
  3615.     n=len(si)
  3616.     if n==0: return 0
  3617.     sum1=sum([oceni[person1][it] for it in si])
  3618.     sum2=sum([oceni[person2][it] for it in si])
  3619.    
  3620.     sum1Sq=sum([pow(oceni[person1][it],2) for it in si])
  3621.     sum2Sq=sum([pow(oceni[person2][it],2) for it in si])
  3622.     pSum=sum([oceni[person1][it]*oceni[person2][it] for it in si])
  3623.     num=pSum-(sum1*sum2/n)
  3624.     den=sqrt((sum1Sq-pow(sum1,2)/n)*(sum2Sq-pow(sum2,2)/n))
  3625.     if den==0:
  3626.         return 0
  3627.     r=num/den
  3628.     return (round(r,3),n)
  3629.    
  3630.    
  3631. def TabelaNaSlicniKorisnici(oceni):
  3632.     slicnosti={}
  3633.     for item in oceni:
  3634.         for key in oceni:
  3635.             if item!=oceni:
  3636.                 slicnosti.setdefault(item,{})
  3637.                 torka=sim_pearson(oceni,item,key)
  3638.                 torka1=sim_distance(oceni,item,key)
  3639.                 if torka!=0:
  3640.                     a=torka1[0]
  3641.                     b=torka[0]
  3642.                     c=torka[1]
  3643.                     slicnosti[item][key]=a,b,c
  3644.     return slicnosti
  3645. if __name__ == "__main__":
  3646.     korisnik1=input()
  3647.     korisnik2=input()
  3648.     # korisnik1='Mick LaSalle'
  3649.     # korisnik2='Lisa Rose'
  3650.     # print oceniPoKorisnici
  3651.     tabela=TabelaNaSlicniKorisnici(oceniPoKorisnici)
  3652.     # print tabela
  3653.     print tabela[korisnik1][korisnik2]
  3654.  
  3655. ------------------------------------------------------------------------------------------------------
  3656. Sistemi za preporaka
  3657. # -*- coding: utf-8 -*-
  3658.  
  3659. """
  3660. Да се напише функција која во зависност од бројот на рангирани филмови на корисникот
  3661. ќе одбира начинот на препорачување - дали item-based или user-based. Функцијата треба да прима аргумент
  3662. име на корисникот и бројот n од стандарден влез. Ако бројот на рангирани филмови на корисникот е помал од n
  3663. препорачува на со item-based начин, а ако е поголем или еднаков на n да препорачува на user-based начин. На излез да
  3664. се печати одбраниот начин (user-based или item-based), и во вториот ред да се испечати листа од препорачани филмови која
  3665. ги содржи само имињата сортирани во растечки (азбучен) редослед.
  3666.  
  3667. """
  3668.  
  3669.  
# Training data: per-user movie ratings, user name -> {movie title: rating}.
# Ratings appear to be on a 1.0-5.0 scale; not every user rated every movie.
oceniPoKorisnici={
    'Lisa Rose': {'Catch Me If You Can': 3.0 , 'Snakes on a Plane': 3.5, 'Superman Returns': 3.5, 'You, Me and Dupree': 2.5, 'The Night Listener': 3.0, 'Snitch': 3.0},
    'Gene Seymour': {'Lady in the Water': 3.0, 'Snakes on a Plane': 3.5, 'Just My Luck': 1.5,  'The Night Listener': 3.0,'You, Me and Dupree': 3.5},
    'Michael Phillips': {'Catch Me If You Can': 2.5, 'Lady in the Water': 2.5,'Superman Returns': 3.5, 'The Night Listener': 4.0, 'Snitch': 2.0},
    'Claudia Puig': {'Snakes on a Plane': 3.5, 'Just My Luck': 3.0,'The Night Listener': 4.5, 'Superman Returns': 4.0,'You, Me and Dupree': 2.5},
    'Mick LaSalle': {'Lady in the Water': 3.0, 'Snakes on a Plane': 4.0,'Just My Luck': 2.0, 'Superman Returns': 3.0, 'You, Me and Dupree': 2.0},
    'Jack Matthews': {'Catch Me If You Can': 4.5, 'Lady in the Water': 3.0, 'Snakes on a Plane': 4.0,'The Night Listener': 3.0, 'Superman Returns': 5.0, 'You, Me and Dupree': 3.5, 'Snitch': 4.5},
    'Toby': {'Snakes on a Plane':4.5, 'Snitch': 5.0},
    'Michelle Nichols': {'Just My Luck' : 1.0, 'The Night Listener': 4.5, 'You, Me and Dupree': 3.5, 'Catch Me If You Can': 2.5, 'Snakes on a Plane': 3.0},
    'Gary Coleman': {'Lady in the Water': 1.0, 'Catch Me If You Can': 1.5, 'Superman Returns': 1.5, 'You, Me and Dupree': 2.0},
    'Larry': {'Lady in the Water': 3.0, 'Just My Luck': 3.5, 'Snitch': 1.5, 'The Night Listener': 3.5}
    }
  3682.  
  3683. import math
  3684.  
  3685. # Vrakja merka za slicnost bazirana na rastojanieto za person1 i person2
  3686. def sim_distance(oceni, person1, person2):
  3687.     # Se pravi lista na zaednicki predmeti (filmovi)
  3688.  
  3689.     filmovi1=set(oceni[person1].keys())
  3690.     filmovi2=set(oceni[person2].keys())
  3691.     zaednicki = filmovi1.intersection(filmovi2)
  3692. #     print(filmovi1)
  3693. #     print(filmovi2)
  3694. #     print(zaednicki)
  3695. #     for item in oceni[person1].keys():
  3696. #         if item in oceni[person2]:
  3697. #             zaednicki.add(item)
  3698. #     # ako nemaat zaednicki rejtinzi, vrati 0
  3699.     if len(zaednicki) == 0: return 0
  3700. #     # Soberi gi kvadratite na zaednickite razliki
  3701.     suma = 0.0
  3702.     for item in zaednicki:
  3703.         ocena1 = oceni[person1][item]
  3704.         ocena2 = oceni[person2][item]
  3705.         suma += (ocena1 - ocena2) ** 2
  3706. #         print(item, person1, ocena1, person2, ocena2)
  3707.  
  3708.     return 1 / (1 + math.sqrt(suma))
  3709.  
  3710. def sim_pearson(oceni, p1, p2):
  3711.     # Se kreira recnik vo koj ke se cuvaat predmetite (filmovi) koi se oceneti od dvajcata
  3712.     # Vo recnikot ni se vazni samo klucevite za da gi cuvame iminjata na filmovite koi se zaednicki, a vrednostite ne ni se vazni
  3713.     zaednicki = set()
  3714.     for item in oceni[p1]:
  3715.         if item in oceni[p2]:
  3716.             zaednicki.add(item)
  3717.  
  3718.     # Se presmetuva brojot na predmeti oceneti od dvajcata
  3719.     n = len(zaednicki)
  3720.  
  3721.     # Ako nemaat zaednicki predmeti vrati korelacija 0
  3722.     if n == 0: return 0
  3723.  
  3724.     # Soberi gi zaednickite oceni (rejtinzi) za  sekoja licnost posebno
  3725.     sum1 = 0
  3726.     sum2 = 0
  3727.  
  3728.     # Soberi gi kvadratite od zaednickite oceni (rejtinzi) za  sekoja licnost posebno
  3729.     sum1Sq = 0
  3730.     sum2Sq = 0
  3731.  
  3732.     # Soberi gi proizvodite od ocenite na dvete licnosti
  3733.     pSum = 0
  3734.     for item in zaednicki:
  3735.         ocena1 = oceni[p1][item]
  3736.         ocena2 = oceni[p2][item]
  3737.         sum1 += ocena1
  3738.         sum1Sq += ocena1 ** 2
  3739.         sum2 += ocena2
  3740.         sum2Sq += ocena2 ** 2
  3741.         pSum += ocena1 * ocena2
  3742.  
  3743.     # Presmetaj go koeficientot na korelacija
  3744.     num = pSum - (sum1 * sum2 / n)
  3745.     den = math.sqrt((sum1Sq - pow(sum1, 2) / n) * (sum2Sq - pow(sum2, 2) / n))
  3746.     if den == 0: return 0
  3747.     r = num / den
  3748.     return r
  3749.  
  3750. def topMatches(oceni, person, n=5, similarity=sim_pearson):
  3751.     scores = []
  3752.     for person2 in oceni.keys():
  3753.         if person != person2:
  3754.             s = similarity(oceni, person, person2)
  3755.             scores.append((s, person2))
  3756.     # Se sortira listata vo rastecki redosled
  3757.     scores.sort()
  3758.     # Se prevrtuva za najslicnite (so najgolema vrednost) da bidat prvi
  3759.     scores.reverse()
  3760.     if n is None:
  3761.         return scores
  3762.     else:
  3763.         return scores[0:n]
  3764.  
  3765. def getUserBasedRecomendations(oceni, person, similarity=sim_pearson, min_zaednicki=None):
  3766.     totals = {}
  3767.     simSums = {}
  3768.     for person2 in oceni.keys():
  3769.         # Za da ne se sporeduva so samiot sebe
  3770.         if person2 == person: continue
  3771.         filmovi1=set(oceni[person].keys())
  3772.         filmovi2=set(oceni[person2].keys())
  3773.         zaednicki = filmovi1.intersection(filmovi2)
  3774.         # ova e ako se bara minimum zaednicki filmovi
  3775.         # za da se zemat vo predvid ocenite na drugiot korisnik
  3776.         if min_zaednicki and len(zaednicki)<min_zaednicki:
  3777.             # print('So korisnikot', person2, 'imame samo',len(zaednicki),'filmovi, pa go preskoknuvame')
  3778.             continue
  3779.         sim = similarity(oceni, person, person2)
  3780. #         print(person,person2,sim)
  3781.         # ne se zemaat vo predvid rezultati <= 0
  3782.         if sim <= 0: continue
  3783.         # print(person,person2,sim)
  3784.         for item in oceni[person2].keys():
  3785. #             print(item, oceni[person].get(item, None), oceni[person2].get(item, None))
  3786.             # za tekovniot korisnik gi zemame samo filmovite sto gi nemame veke gledano
  3787.             if item not in oceni[person]: # or oceni[person][item] == 0:
  3788.                 # similarity * Score   (Slicnost * Ocena)
  3789.                 # print(item, sim, oceni[person2][item], sim* oceni[person2][item])
  3790.                 totals.setdefault(item, 0)
  3791.                 totals[item] += oceni[person2][item] * sim
  3792.  
  3793.                 # Sumuma na slicnosti
  3794.                 simSums.setdefault(item, 0)
  3795.                 simSums[item] += sim
  3796.         # print()
  3797.     # return
  3798.     # print()
  3799.     # Kreiranje na normalizirana lista so rejtinzi
  3800.     # rankings = [(total / simSums[item], item) for item, total in totals.items()]
  3801.     rankings = []
  3802.     for item, weighted_score in totals.items():
  3803.         sim_total = simSums[item]
  3804.         my_score = round(weighted_score / sim_total, 1)
  3805.         # print(item, weighted_score, sim_total, my_score)
  3806.         rankings.append((my_score, item))
  3807.  
  3808.     # Sortiranje na listata vo rastecki redosled
  3809.     rankings.sort(reverse=True)
  3810.     # Prevrtuvanje na lista za najgolemite vrednosti da bidat napred
  3811. #     rankings.reverse()
  3812.     a = [item[1] for item in rankings][0:3]
  3813.     a.sort()
  3814.     return a
  3815.  
  3816. def transformoceni(oceni):
  3817.     result = {}
  3818.     for person in oceni.keys():
  3819.         for item in oceni[person]:
  3820.             result.setdefault(item, {})
  3821.             # Zameni gi mestata na licnosta i predmetot
  3822.             result[item][person] = oceni[person][item]
  3823.     return result
  3824.  
  3825. def getItemBasedRecomendations(critics, person1, n=3):
  3826.     oceni_po_film = transformoceni(critics)
  3827.     similarity_per_item = {}
  3828.     for item in critics[person1].keys():
  3829.         similar_items = topMatches(oceni_po_film, item, n=None)
  3830.         my_rating = critics[person1][item]
  3831.  
  3832.         for similarity, item2 in similar_items:
  3833.             if item2 in critics[person1] or similarity <= 0:
  3834. #                 print('Slicnost', similarity, 'na', item,'so', item2)
  3835.                 continue
  3836.             weight= similarity * my_rating
  3837. #             print('Slicnost', similarity, 'na', item,'so', item2, weight)
  3838.             similarity_per_item.setdefault(item2, [])
  3839.             similarity_per_item[item2].append(weight)
  3840. #         print(item, my_rating, list(similarity_per_item.items()))
  3841.     similarity_per_item_avg = []
  3842.     import numpy as np
  3843.     for item in similarity_per_item:
  3844.         #print(item, similarity_per_item[item])
  3845.         avg_sim = np.mean(similarity_per_item[item])
  3846.         similarity_per_item_avg.append((avg_sim, item))
  3847.     #similarity_per_item_avg.sort(reverse=True)
  3848.  
  3849.     similarity_per_item_avg.sort(reverse=True)
  3850.     novi = [] #[item[1] for item in similarity_per_item_avg if item[1] > 0][0:3]
  3851.  
  3852.     for item in similarity_per_item_avg:
  3853.         # print item
  3854.         novi.append(item[1])
  3855.     novi = novi[0:3]
  3856.     novi.sort()
  3857.     return novi
  3858.  
  3859.     #return li #similarity_per_item_avg[:n]
  3860.  
  3861. if __name__ == '__main__':
  3862.  
  3863.     k = input()
  3864.     n = input()
  3865.  
  3866.     long = len(oceniPoKorisnici[k].keys())
  3867.     if long < n:
  3868.         print 'item-based\n', getItemBasedRecomendations(oceniPoKorisnici,k)
  3869.     else:
  3870.         print 'user-based\n', getUserBasedRecomendations(oceniPoKorisnici, k)
  3871.  
  3872. ------------------------------------------------------------------------------------------------------
  3873. Sistemi za preporaka
  3874.  
  3875. """
  3876. По изработка на задачите од претходната вежба веќе ќе имате две тренинг множества претставени во Python како речник од речници. Искористете ги за изработка на систем за препораки така што да може на секој од тест корисниците да им предложи по 3 филмови, еднаш користејќи item-based, а еднаш user-based препораки. При item-based пристапот се предлагаат фимови кои ги нема гледано корисникот кои се со позитивна сличност со некои од филмовите кои ги има гледано. На излез треба да се печатат две листи кои ги содржат само имињата на предложените филмови во растечки (азбучен) редослед. Првата листа е според user-based, а втората според item-based пристап.
  3877.  
  3878. """
  3879.  
# Training data: per-user movie ratings, user name -> {movie title: rating}.
# Ratings appear to be on a 1.0-5.0 scale; not every user rated every movie.
critics={
    'Lisa Rose': {'Catch Me If You Can': 3.0 , 'Snakes on a Plane': 3.5, 'Superman Returns': 3.5, 'You, Me and Dupree': 2.5, 'The Night Listener': 3.0, 'Snitch': 3.0},
    'Gene Seymour': {'Lady in the Water': 3.0, 'Snakes on a Plane': 3.5, 'Just My Luck': 1.5,  'The Night Listener': 3.0,'You, Me and Dupree': 3.5},
    'Michael Phillips': {'Catch Me If You Can': 2.5, 'Lady in the Water': 2.5,'Superman Returns': 3.5, 'The Night Listener': 4.0, 'Snitch': 2.0},
    'Claudia Puig': {'Snakes on a Plane': 3.5, 'Just My Luck': 3.0,'The Night Listener': 4.5, 'Superman Returns': 4.0,'You, Me and Dupree': 2.5},
    'Mick LaSalle': {'Lady in the Water': 3.0, 'Snakes on a Plane': 4.0,'Just My Luck': 2.0, 'Superman Returns': 3.0, 'You, Me and Dupree': 2.0},
    'Jack Matthews': {'Catch Me If You Can': 4.5, 'Lady in the Water': 3.0, 'Snakes on a Plane': 4.0,'The Night Listener': 3.0, 'Superman Returns': 5.0, 'You, Me and Dupree': 3.5, 'Snitch': 4.5},
    'Toby': {'Snakes on a Plane':4.5, 'Snitch': 5.0},
    'Michelle Nichols': {'Just My Luck' : 1.0, 'The Night Listener': 4.5, 'You, Me and Dupree': 3.5, 'Catch Me If You Can': 2.5, 'Snakes on a Plane': 3.0},
    'Gary Coleman': {'Lady in the Water': 1.0, 'Catch Me If You Can': 1.5, 'Superman Returns': 1.5, 'You, Me and Dupree': 2.0},
    'Larry': {'Lady in the Water': 3.0, 'Just My Luck': 3.5, 'Snitch': 1.5, 'The Night Listener': 3.5}
    }
  3892.  
  3893.  
  3894. from math import sqrt
  3895.  
  3896. def sim_distance(oceni, person1, person2):
  3897.     filmovi1=set(oceni[person1].keys())
  3898.     filmovi2=set(oceni[person2].keys())
  3899.     zaednicki = filmovi1.intersection(filmovi2)
  3900.     #print(filmovi1)
  3901.     #print(filmovi2)
  3902.     #print(zaednicki)
  3903.  
  3904.     for item in oceni[person1].keys():
  3905.          if item in oceni[person2]:
  3906.              zaednicki.add(item)
  3907.  
  3908.     if len(zaednicki) == 0: return 0
  3909.  
  3910.     suma = 0.0
  3911.     for item in zaednicki:
  3912.         ocena1 = oceni[person1][item]
  3913.         ocena2 = oceni[person2][item]
  3914.         suma += (ocena1 - ocena2) ** 2
  3915. #         print(item, person1, ocena1, person2, ocena2)
  3916.  
  3917.     return 1 / (1 + sqrt(suma))
  3918.  
  3919. def sim_pearson(oceni, p1, p2):
  3920.     zaednicki = set()
  3921.     for item in oceni[p1]:
  3922.         if item in oceni[p2]:
  3923.             zaednicki.add(item)
  3924.  
  3925.     # Se presmetuva brojot na predmeti oceneti od dvajcata
  3926.     n = len(zaednicki)
  3927.  
  3928.     # Ako nemaat zaednicki predmeti vrati korelacija 0
  3929.     if n == 0: return 0
  3930.  
  3931.     # Soberi gi zaednickite oceni (rejtinzi) za  sekoja licnost posebno
  3932.     sum1 = 0
  3933.     sum2 = 0
  3934.  
  3935.     # Soberi gi kvadratite od zaednickite oceni (rejtinzi) za  sekoja licnost posebno
  3936.     sum1Sq = 0
  3937.     sum2Sq = 0
  3938.  
  3939.     # Soberi gi proizvodite od ocenite na dvete licnosti
  3940.     pSum = 0
  3941.     for item in zaednicki:
  3942.         ocena1 = oceni[p1][item]
  3943.         ocena2 = oceni[p2][item]
  3944.         sum1 += ocena1
  3945.         sum1Sq += ocena1 ** 2
  3946.         sum2 += ocena2
  3947.         sum2Sq += ocena2 ** 2
  3948.         pSum += ocena1 * ocena2
  3949.  
  3950.     # Presmetaj go koeficientot na korelacija
  3951.     num = pSum - (sum1 * sum2 / n)
  3952.     den = sqrt((sum1Sq - pow(sum1, 2) / n) * (sum2Sq - pow(sum2, 2) / n))
  3953.     if den == 0: return 0
  3954.     r = num / den
  3955.     return r
  3956.  
  3957. def transformPrefs(oceni):
  3958.     result = {}
  3959.     for person in oceni.keys():
  3960.         for item in oceni[person]:
  3961.             result.setdefault(item, {})
  3962.             # Zameni gi mestata na licnosta i predmetot
  3963.             result[item][person] = oceni[person][item]
  3964.     return result
  3965.  
  3966. def topMatches(oceni, person, n=5, similarity=sim_pearson):
  3967.     scores = []
  3968.     for person2 in oceni.keys():
  3969.         if person != person2:
  3970.             s = similarity(oceni, person, person2)
  3971.             scores.append((s, person2))
  3972.     # Se sortira listata vo rastecki redosled
  3973.     scores.sort()
  3974.     # Se prevrtuva za najslicnite (so najgolema vrednost) da bidat prvi
  3975.     scores.reverse()
  3976.     if n is None:
  3977.         return scores
  3978.     else:
  3979.         return scores[0:n]
  3980.  
  3981. def getRecommendations(oceni, person, similarity=sim_pearson, min_zaednicki=None):
  3982.     totals = {}
  3983.     simSums = {}
  3984.     for person2 in oceni.keys():
  3985.  
  3986.         filmovi1 = set(oceni[person].keys())
  3987.         filmovi2 = set(oceni[person2].keys())
  3988.         zaednicki = filmovi1.intersection(filmovi2)
  3989.         # ova e ako se bara minimum zaednicki filmovi
  3990.         # za da se zemat vo predvid ocenite na drugiot korisnik
  3991.         if min_zaednicki and len(zaednicki) < min_zaednicki:
  3992.             print('So korisnikot', person2, 'imame samo', len(zaednicki), 'filmovi, pa go preskoknuvame')
  3993.             continue
  3994.  
  3995.         if person2 == person:
  3996.             continue
  3997.  
  3998.         sim = similarity(oceni, person, person2)
  3999.  
  4000.         if sim <= 0:
  4001.             continue
  4002.  
  4003.         for item in oceni[person2]:
  4004.             if item not in oceni[person] or oceni[person][item] == 0:
  4005.                 # similarity * Score   (Slicnost * Ocena)
  4006.                 #print(item, sim, oceni[person2][item], sim* oceni[person2][item])
  4007.                 totals.setdefault(item, 0)
  4008.                 totals[item] += oceni[person2][item] * sim
  4009.  
  4010.                 # Sumuma na slicnosti
  4011.                 simSums.setdefault(item, 0)
  4012.                 simSums[item] += sim
  4013.  
  4014.     rankings = [(total / simSums[item], item) for item, total in totals.items()]
  4015.     #rankings = [item for item,v in totals.items()]
  4016.     rankings.sort(reverse=True)
  4017.     a = []
  4018.     for i in rankings:
  4019.         a.append(i[1])
  4020.     a = a[:3]
  4021.     a.sort()
  4022.     return a#sorted(rankings[0:3])
  4023.  
  4024. def getItemBasedRecomendations(oceni,korisnik,similarity=sim_pearson):
  4025.     rankings = []
  4026.     filmovi = oceni.keys()
  4027.     gledani = [item for item in filmovi if critics[korisnik].has_key(item)]
  4028.     negledani = [item for item in filmovi if not critics[korisnik].has_key(item)]
  4029.     #print gledani
  4030.     #print negledani
  4031.     slicnosti = {}
  4032.     for film in negledani:
  4033.         for drug in gledani:
  4034.             sim = similarity(oceni, film, drug)
  4035.             slicnosti.setdefault(film, 0);
  4036.             if slicnosti[film] < sim:
  4037.                 slicnosti[film] = sim
  4038.     stvari = slicnosti.items()
  4039.     #print stvari
  4040.     stvari.sort(key=lambda tup: tup[1], reverse=True)
  4041.     novi = [item[0] for item in stvari if item[1] > 0][0:3]
  4042.     novi.sort()
  4043.     return novi
  4044.  
  4045. def item_based(critics, person1, n=3):
  4046.     oceni_po_film = transformPrefs(critics)
  4047.     similarity_per_item = {}
  4048.     for item in critics[person1].keys():
  4049.         similar_items = topMatches(oceni_po_film, item, n=None)
  4050.         my_rating = critics[person1][item]
  4051.  
  4052.         for similarity, item2 in similar_items:
  4053.             if item2 in critics[person1] or similarity <= 0:
  4054. #                 print('Slicnost', similarity, 'na', item,'so', item2)
  4055.                 continue
  4056.             weight= similarity * my_rating
  4057. #             print('Slicnost', similarity, 'na', item,'so', item2, weight)
  4058.             similarity_per_item.setdefault(item2, [])
  4059.             similarity_per_item[item2].append(weight)
  4060. #         print(item, my_rating, list(similarity_per_item.items()))
  4061.     similarity_per_item_avg = []
  4062.     import numpy as np
  4063.     for item in similarity_per_item:
  4064.         #print(item, similarity_per_item[item])
  4065.         avg_sim = np.mean(similarity_per_item[item])
  4066.         similarity_per_item_avg.append((avg_sim, item))
  4067.     similarity_per_item_avg.sort(reverse=True)
  4068.     a = []
  4069.     for i in similarity_per_item_avg:
  4070.         a.append(i[1])
  4071.     a = a[:n]
  4072.     a.sort()
  4073.     return a #similarity_per_item_avg[:n]
  4074.  
  4075. if __name__ == "__main__":
  4076.     korisnik = input()
  4077.  
  4078.     print "user-based: " + str(getRecommendations(critics, korisnik))
  4079.     print "item-based: " + str(item_based(critics=critics, person1=korisnik))
  4080.  
  4081. ------------------------------------------------------------------------------------------------------
  4082. # -*- coding:utf-8 -*-
  4083.  
  4084. """
  4085. Да испрограмира функција за косинусна сличност која е дефинирана со следнава формула, каде A е листа со оцените на едниот корисник или филм, а B е листа со оцените на другиот корисник или филм:
cos(A, B) = (A . B) / (||A|| * ||B||)  =  sum(a_i * b_i) / (sqrt(sum(a_i^2)) * sqrt(sum(b_i^2)))
  4087. Притоа треба да се избегне делење со нула и во тој случај да се смета дека сличноста е -1.
  4088. Речник со оцени на корисници по филмови треба е веќе даден. Од стандардниот влез се вчитува име на еден филм. Да се испечати сличноста на прочитаниот филм со секој друг филм (освен самиот со себе) така што ќе се печати:
  4089. Филм 2
  4090. Косинусна сличност, Пеарсонова сличност, Евклидова сличност
  4091. Празна линија
  4092. При печатењето филмовите треба да бидат подредени по азбучен редослед. Сите сличности треба да бидат заокружени на 2 децимали.
  4093. """
  4094.  
  4095.  
  4096. from math import sqrt
  4097.  
  4098.  
  4099.  
# Training data: per-user movie ratings, user name -> {movie title: rating}.
# Ratings appear to be on a 1.0-5.0 scale; not every user rated every movie.
oceniPoKorisnici={
    'Lisa Rose': {'Catch Me If You Can': 3.0 , 'Snakes on a Plane': 3.5, 'Superman Returns': 3.5, 'You, Me and Dupree': 2.5, 'The Night Listener': 3.0, 'Snitch': 3.0},
    'Gene Seymour': {'Lady in the Water': 3.0, 'Snakes on a Plane': 3.5, 'Just My Luck': 1.5,  'The Night Listener': 3.0,'You, Me and Dupree': 3.5},
    'Michael Phillips': {'Catch Me If You Can': 2.5, 'Lady in the Water': 2.5,'Superman Returns': 3.5, 'The Night Listener': 4.0, 'Snitch': 2.0},
    'Claudia Puig': {'Snakes on a Plane': 3.5, 'Just My Luck': 3.0,'The Night Listener': 4.5, 'Superman Returns': 4.0,'You, Me and Dupree': 2.5},
    'Mick LaSalle': {'Lady in the Water': 3.0, 'Snakes on a Plane': 4.0,'Just My Luck': 2.0, 'Superman Returns': 3.0, 'You, Me and Dupree': 2.0},
    'Jack Matthews': {'Catch Me If You Can': 4.5, 'Lady in the Water': 3.0, 'Snakes on a Plane': 4.0,'The Night Listener': 3.0, 'Superman Returns': 5.0, 'You, Me and Dupree': 3.5, 'Snitch': 4.5},
    'Toby': {'Snakes on a Plane':4.5, 'Snitch': 5.0},
    'Michelle Nichols': {'Just My Luck' : 1.0, 'The Night Listener': 4.5, 'You, Me and Dupree': 3.5, 'Catch Me If You Can': 2.5, 'Snakes on a Plane': 3.0},
    'Gary Coleman': {'Lady in the Water': 1.0, 'Catch Me If You Can': 1.5, 'Superman Returns': 1.5, 'You, Me and Dupree': 2.0},
    'Larry': {'Lady in the Water': 3.0, 'Just My Luck': 3.5, 'Snitch': 1.5, 'The Night Listener': 3.5}
    }
  4112.  
  4113.  
  4114.  
  4115. def sim_cos(oceni, p1, p2):
  4116.  
  4117.     zaednicki = set()
  4118.     for item in oceni[p1]:
  4119.         if item in oceni[p2]:
  4120.             zaednicki.add(item)
  4121.  
  4122.     #print(zaednicki)
  4123.     #return
  4124.  
  4125.     # Se presmetuva brojot na predmeti oceneti od dvajcata
  4126.     n = len(zaednicki)
  4127.  
  4128.     # Ako nemaat zaednicki predmeti vrati korelacija 0
  4129.     if n == 0: return 0
  4130.  
  4131.     # Soberi gi zaednickite oceni (rejtinzi) za  sekoja licnost posebno
  4132.     sum1 = 0
  4133.     sum2 = 0
  4134.  
  4135.     # Soberi gi kvadratite od zaednickite oceni (rejtinzi) za  sekoja licnost posebno
  4136.     sum1Sq = 0
  4137.     sum2Sq = 0
  4138.  
  4139.     # Soberi gi proizvodite od ocenite na dvete licnosti
  4140.     pSum = 0
  4141.     for item in zaednicki:
  4142.         ocena1 = oceni[p1][item]
  4143.         ocena2 = oceni[p2][item]
  4144.  
  4145.         sum1 += ocena1
  4146.         sum1Sq += ocena1 ** 2
  4147.         sum2 += ocena2
  4148.         sum2Sq += ocena2 ** 2
  4149.         pSum += ocena1 * ocena2
  4150.  
  4151.     # Presmetaj go koeficientot na korelacija
  4152.     num = pSum
  4153.     den = sqrt(sum1Sq) * sqrt(sum2Sq)
  4154.     if den == 0: return -1
  4155.     r = num / den
  4156.     return round(r,2)
  4157.  
  4158. def sim_distance(oceni, person1, person2):
  4159.     # Se pravi lista na zaednicki predmeti (filmovi)
  4160.  
  4161.     filmovi1=set(oceni[person1].keys())
  4162.     filmovi2=set(oceni[person2].keys())
  4163.     zaednicki = filmovi1.intersection(filmovi2)
  4164. #     print(filmovi1)
  4165. #     print(filmovi2)
  4166. #     print(zaednicki)
  4167. #     for item in oceni[person1].keys():
  4168. #         if item in oceni[person2]:
  4169. #             zaednicki.add(item)
  4170. #     # ako nemaat zaednicki rejtinzi, vrati 0
  4171.     if len(zaednicki) == 0: return 0
  4172. #     # Soberi gi kvadratite na zaednickite razliki
  4173.     suma = 0.0
  4174.     for item in zaednicki:
  4175.         ocena1 = oceni[person1][item]
  4176.         ocena2 = oceni[person2][item]
  4177.         suma += (ocena1 - ocena2) ** 2
  4178. #         print(item, person1, ocena1, person2, ocena2)
  4179.  
  4180.     return round(1 / (1 + sqrt(suma)),2)
  4181.  
  4182. def sim_pearson(oceni, p1, p2):
  4183.     # Se kreira recnik vo koj ke se cuvaat predmetite (filmovi) koi se oceneti od dvajcata
  4184.     # Vo recnikot ni se vazni samo klucevite za da gi cuvame iminjata na filmovite koi se zaednicki, a vrednostite ne ni se vazni
  4185.     zaednicki = set()
  4186.     for item in oceni[p1]:
  4187.         if item in oceni[p2]:
  4188.             zaednicki.add(item)
  4189.  
  4190.     # Se presmetuva brojot na predmeti oceneti od dvajcata
  4191.     n = len(zaednicki)
  4192.  
  4193.     # Ako nemaat zaednicki predmeti vrati korelacija 0
  4194.     if n == 0: return 0
  4195.  
  4196.     # Soberi gi zaednickite oceni (rejtinzi) za  sekoja licnost posebno
  4197.     sum1 = 0
  4198.     sum2 = 0
  4199.  
  4200.     # Soberi gi kvadratite od zaednickite oceni (rejtinzi) za  sekoja licnost posebno
  4201.     sum1Sq = 0
  4202.     sum2Sq = 0
  4203.  
  4204.     # Soberi gi proizvodite od ocenite na dvete licnosti
  4205.     pSum = 0
  4206.     for item in zaednicki:
  4207.         ocena1 = oceni[p1][item]
  4208.         ocena2 = oceni[p2][item]
  4209.         sum1 += ocena1
  4210.         sum1Sq += ocena1 ** 2
  4211.         sum2 += ocena2
  4212.         sum2Sq += ocena2 ** 2
  4213.         pSum += ocena1 * ocena2
  4214.  
  4215.     # Presmetaj go koeficientot na korelacija
  4216.     num = pSum - (sum1 * sum2 / n)
  4217.     den = sqrt((sum1Sq - pow(sum1, 2) / n) * (sum2Sq - pow(sum2, 2) / n))
  4218.     if den == 0: return 0
  4219.     r = num / den
  4220.     return round(r,2)
  4221.  
  4222. def topMatches(oceni, person, n=5, similarity=sim_pearson):
  4223.     scores = []
  4224.     for person2 in oceni.keys():
  4225.         if person != person2:
  4226.             s = similarity(oceni, person, person2)
  4227.             scores.append((s, person2))
  4228.     # Se sortira listata vo rastecki redosled
  4229.     scores.sort()
  4230.     # Se prevrtuva za najslicnite (so najgolema vrednost) da bidat prvi
  4231.     scores.reverse()
  4232.     if n is None:
  4233.         return scores
  4234.     else:
  4235.         return scores[0:n]
  4236.  
  4237. def transformoceni(oceni):
  4238.     result = {}
  4239.     for person in oceni.keys():
  4240.         for item in oceni[person]:
  4241.             result.setdefault(item, {})
  4242.             # Zameni gi mestata na licnosta i predmetot
  4243.             result[item][person] = oceni[person][item]
  4244.     return result
  4245.  
  4246. def getRecommendations(oceni, person, similarity=sim_pearson, min_zaednicki=None):
  4247.     totals = {}
  4248.     simSums = {}
  4249.     for person2 in oceni.keys():
  4250.         # Za da ne se sporeduva so samiot sebe
  4251.         if person2 == person: continue
  4252.         filmovi1=set(oceni[person].keys())
  4253.         filmovi2=set(oceni[person2].keys())
  4254.         zaednicki = filmovi1.intersection(filmovi2)
  4255.         # ova e ako se bara minimum zaednicki filmovi
  4256.         # za da se zemat vo predvid ocenite na drugiot korisnik
  4257.         if min_zaednicki and len(zaednicki)<min_zaednicki:
  4258.             print('So korisnikot', person2, 'imame samo',len(zaednicki),'filmovi, pa go preskoknuvame')
  4259.             continue
  4260.         sim = similarity(oceni, person, person2)
  4261. #         print(person,person2,sim)
  4262.         # ne se zemaat vo predvid rezultati <= 0
  4263.         if sim <= 0: continue
  4264.         print(person,person2,sim)
  4265.         for item in oceni[person2].keys():
  4266. #             print(item, oceni[person].get(item, None), oceni[person2].get(item, None))
  4267.             # za tekovniot korisnik gi zemame samo filmovite sto gi nemame veke gledano
  4268.             if item not in oceni[person]: # or oceni[person][item] == 0:
  4269.                 # similarity * Score   (Slicnost * Ocena)
  4270.                 print(item, sim, oceni[person2][item], sim* oceni[person2][item])
  4271.                 totals.setdefault(item, 0)
  4272.                 totals[item] += oceni[person2][item] * sim
  4273.  
  4274.                 # Sumuma na slicnosti
  4275.                 simSums.setdefault(item, 0)
  4276.                 simSums[item] += sim
  4277.         print()
  4278. #     return
  4279.     print()
  4280.     # Kreiranje na normalizirana lista so rejtinzi
  4281.     # rankings = [(total / simSums[item], item) for item, total in totals.items()]
  4282.     rankings = []
  4283.     for item, weighted_score in totals.items():
  4284.         sim_total = simSums[item]
  4285.         my_score = round(weighted_score / sim_total, 1)
  4286.         print(item, weighted_score, sim_total, my_score)
  4287.         rankings.append((my_score, item))
  4288.  
  4289.     # Sortiranje na listata vo rastecki redosled
  4290.     rankings.sort(reverse=True)
  4291.     # Prevrtuvanje na lista za najgolemite vrednosti da bidat napred
  4292. #     rankings.reverse()
  4293.     return rankings
  4294.  
  4295. def item_based(critics, person1, n=3):
  4296.     oceni_po_film = transformoceni(critics)
  4297.     similarity_per_item = {}
  4298.     for item in critics[person1].keys():
  4299.         similar_items = topMatches(oceni_po_film, item, n=None)
  4300.         my_rating = critics[person1][item]
  4301.  
  4302.         for similarity, item2 in similar_items:
  4303.             if item2 in critics[person1] or similarity <= 0:
  4304. #                 print('Slicnost', similarity, 'na', item,'so', item2)
  4305.                 continue
  4306.             weight= similarity * my_rating
  4307. #             print('Slicnost', similarity, 'na', item,'so', item2, weight)
  4308.             similarity_per_item.setdefault(item2, [])
  4309.             similarity_per_item[item2].append(weight)
  4310. #         print(item, my_rating, list(similarity_per_item.items()))
  4311.     similarity_per_item_avg = []
  4312.     import numpy as np
  4313.     for item in similarity_per_item:
  4314.         print(item, similarity_per_item[item])
  4315.         avg_sim = np.mean(similarity_per_item[item])
  4316.         similarity_per_item_avg.append((avg_sim, item))
  4317.     similarity_per_item_avg.sort(reverse=True)
  4318.     return similarity_per_item_avg[:n]
  4319.  
  4320. def transformoceni(oceni):
  4321.     result = {}
  4322.     for person in oceni.keys():
  4323.         for item in oceni[person]:
  4324.             result.setdefault(item, {})
  4325.             # Zameni gi mestata na licnosta i predmetot
  4326.             result[item][person] = oceni[person][item]
  4327.     return result
  4328.  
  4329.  
  4330. if __name__ == '__main__':
  4331.  
  4332.     film = 'Catch Me If You Can'
  4333.  
  4334.     # film = input()
  4335.  
  4336.     movie_base=transformoceni(oceniPoKorisnici)
  4337.  
  4338.     for k in sorted(movie_base.keys()):
  4339.         if film == k:
  4340.             continue
  4341.         else:
  4342.             print(k)
  4343.             print sim_cos(movie_base,film,k), sim_pearson(movie_base,film,k), sim_distance(movie_base,film,k)
  4344.             print
  4345.  
  4346. ------------------------------------------------------------------------------------------------------
  4347. Sistemi za preporaka januari 2017
  4348.  
  4349. За корисникот внесен на влез да се препорача филм. Да се користи Пирсонов коефициент на корелација како мерка.
  4350. Ако корисникот го нема во базата да се препорача најгледаниот филм. Доколку корисникот има гледано повеќе од 5 филмови
  4351. , да се препорача според филмовите, во спротивно да се препорача според корисниците кои се слични со него.
  4352.  
  4353. from __future__ import print_function
  4354. import json
  4355. from math import sqrt
  4356.  
  4357. # A dictionary of movie critics and their ratings of a small set of movies
# Maps each critic's name to a dict of {movie title: rating}; ratings in
# this sample range from 1.0 to 5.0.  Not every critic has rated every
# movie, which is what the similarity functions below must handle.
critics = {
    'Lisa Rose': {'Catch Me If You Can': 3.0, 'Snakes on a Plane': 3.5, 'Superman Returns': 3.5,
                  'You, Me and Dupree': 2.5, 'The Night Listener': 3.0, 'Snitch': 3.0},
    'Gene Seymour': {'Lady in the Water': 3.0, 'Snakes on a Plane': 3.5, 'Just My Luck': 1.5, 'The Night Listener': 3.0,
                     'You, Me and Dupree': 3.5},
    'Michael Phillips': {'Catch Me If You Can': 2.5, 'Lady in the Water': 2.5, 'Superman Returns': 3.5,
                         'The Night Listener': 4.0, 'Snitch': 2.0},
    'Claudia Puig': {'Snakes on a Plane': 3.5, 'Just My Luck': 3.0, 'The Night Listener': 4.5, 'Superman Returns': 4.0,
                     'You, Me and Dupree': 2.5},
    'Mick LaSalle': {'Lady in the Water': 3.0, 'Snakes on a Plane': 4.0, 'Just My Luck': 2.0, 'Superman Returns': 3.0,
                     'You, Me and Dupree': 2.0},
    'Jack Matthews': {'Catch Me If You Can': 4.5, 'Lady in the Water': 3.0, 'Snakes on a Plane': 4.0,
                      'The Night Listener': 3.0, 'Superman Returns': 5.0, 'You, Me and Dupree': 3.5, 'Snitch': 4.5},
    'Toby': {'Snakes on a Plane': 4.5, 'Snitch': 5.0},
    'Michelle Nichols': {'Just My Luck': 1.0, 'The Night Listener': 4.5, 'You, Me and Dupree': 3.5,
                         'Catch Me If You Can': 2.5, 'Snakes on a Plane': 3.0},
    'Gary Coleman': {'Lady in the Water': 1.0, 'Catch Me If You Can': 1.5, 'Superman Returns': 1.5,
                     'You, Me and Dupree': 2.0},
    'Larry': {'Lady in the Water': 3.0, 'Just My Luck': 3.5, 'Snitch': 1.5, 'The Night Listener': 3.5}
}
  4378.  
  4379. def sim_distance(oceni,person1,person2):
  4380.     si={}
  4381.     for item in oceni[person1]:
  4382.         if item in oceni[person2]:
  4383.             si[item]=1
  4384.     if len(si)==0: return 0
  4385.     sum_of_squares=sum([pow(oceni[person1][item]-oceni[person2][item],2)
  4386.     for item in oceni[person1] if item in oceni[person2]])
  4387.     return 1/(1+sqrt(sum_of_squares))
  4388.  
  4389. def sim_pearson(oceni,person1,person2):
  4390.     si={}
  4391.     for item in oceni[person1]:
  4392.         if item in oceni[person2]: si[item]=1
  4393.     n=len(si)
  4394.     if n==0: return 0
  4395.     sum1=sum([oceni[person1][it] for it in si])
  4396.     sum2=sum([oceni[person2][it] for it in si])
  4397.     sum1Sq=sum([pow(oceni[person1][it],2) for it in si])
  4398.     sum2Sq=sum([pow(oceni[person2][it],2) for it in si])
  4399.     pSum=sum([oceni[person1][it]*oceni[person2][it] for it in si])
  4400.     num=pSum-(sum1*sum2/n)
  4401.     den=sqrt((sum1Sq-pow(sum1,2)/n)*(sum2Sq-pow(sum2,2)/n))
  4402.     if den==0: return 0
  4403.     r=num/den
  4404.     return r
  4405.    
  4406. def transformPrefs(prefs):
  4407.     result = {}
  4408.     for person in prefs:
  4409.         for item in prefs[person]:
  4410.             result.setdefault(item,{})
  4411.             result[item][person]=prefs[person][item]
  4412.     return result
  4413.  
  4414. def topMatches(prefs,person,n=4,similarity=sim_pearson):
  4415.     scores=[(similarity(prefs,person,other),other)
  4416.             for other in prefs if other!=person]
  4417.     scores.sort()
  4418.     scores.reverse()
  4419.     return scores[0:n]
  4420.  
  4421. def getUserBasedRecommendations(oceni,korisnik,similarity=sim_pearson):
  4422.     totals={}
  4423.     simSums={}
  4424.     for other in oceni:
  4425.         if other==korisnik: continue
  4426.         sim=similarity(oceni,korisnik,other)
  4427.         if sim<=0: continue
  4428.         for item in oceni[other]:
  4429.             if item not in oceni[korisnik] or oceni[korisnik][item]==0:
  4430.                 totals.setdefault(item,0)
  4431.                 totals[item]+=oceni[other][item]*sim
  4432.                 simSums.setdefault(item,0)
  4433.                 simSums[item]+=sim
  4434.                
  4435.     rankings=[(total/simSums[item],item) for item,total in totals.items()]
  4436.     rankings.sort()
  4437.     rankings.reverse()
  4438.     rankings = rankings[0:3]
  4439.     return rankings
  4440.    
  4441. def getItemBasedRecomendations(oceni,korisnik,similarity=sim_pearson):
  4442.     similar={}
  4443.     films=transformPrefs(oceni)
  4444.     for gledanFilm in oceni[korisnik]:
  4445.             similar_filmovi=topMatches(films,gledanFilm)
  4446.             for slicnost,slicen_film in similar_filmovi:
  4447.                 if slicen_film not in oceni[korisnik] and (slicen_film not in similar or slicnost>similar[slicen_film]):
  4448.                     similar[slicen_film]=slicnost
  4449.    
  4450.     rankings=sorted(similar, key=similar.get)
  4451.     rankings.reverse()
  4452.     rankings=rankings[0:3]
  4453.     rankings.sort()
  4454.     return rankings
  4455.    
  4456.  
  4457. def transformoceni(oceni):
  4458.     result={}
  4459.     for person in oceni:
  4460.         for item in oceni[person]:
  4461.             result.setdefault(item,{})
  4462.             result[item][person]=oceni[person][item]
  4463.     return result
  4464.  
  4465.  
  4466. def item_based(critics, person1, n=3):
  4467.     oceni_po_film = transformoceni(critics)
  4468.     similarity_per_item = {}
  4469.     for item in critics[person1].keys():
  4470.         similar_items = topMatches(oceni_po_film, item, n=None)
  4471.         my_rating = critics[person1][item]
  4472.         for similarity, item in similar_items:
  4473.             if item in critics[person1] or similarity <= 0:
  4474.                 continue
  4475.             similarity_per_item.setdefault(item, [])
  4476.             similarity_per_item[item].append(similarity * my_rating)
  4477.     similarity_per_item_avg = []
  4478.     import numpy as np
  4479.     for item in similarity_per_item:    
  4480.         avg_sim = np.mean(similarity_per_item[item])
  4481.         similarity_per_item_avg.append((avg_sim, item))
  4482.     similarity_per_item_avg.sort(reverse=True)
  4483.     return similarity_per_item_avg[:n]
  4484.  
  4485. if __name__ == "__main__":
  4486.     korisnik = input()
  4487.     inverse = transformoceni(critics)
  4488.     korisniciIFilmovi = inverse.items()
  4489.  
  4490.     korisniciIFilmovi.sort(key=lambda tup: len(tup[1].items()), reverse=True)
  4491.  
  4492.     najgledan = korisniciIFilmovi[0][0]
  4493.  
  4494.     if not korisnik in critics:
  4495.         print(najgledan)
  4496.         exit()
  4497.  
  4498.     oceniNaKorisnik = critics[korisnik]
  4499.     filmoviNaKorisnik = oceniNaKorisnik.keys()
  4500.  
  4501.     imaPoveke = False
  4502.     for k in critics:
  4503.         if k != korisnik:
  4504.             brojac = 0
  4505.             for film in filmoviNaKorisnik:
  4506.                 brojac += 1
  4507.  
  4508.             if brojac > 5:
  4509.                 imaPoveke = True
  4510.  
  4511.  
  4512.     if imaPoveke:
  4513.         print (str(getItemBasedRecomendations(critics, korisnik)[0]))
  4514.     else:
  4515.         print (str(getUserBasedRecommendations(critics, korisnik)[0][1]))
  4516.  
  4517. ------------------------------------------------------------------------------------------------------
Advertisement
Add Comment
Please, Sign In to add comment