Advertisement
lameski

Untitled

Jun 9th, 2017
79
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 7.27 KB | None | 0 0
  1. trainingData=[['slashdot','USA','yes',18,'None'],
  2. ['google','France','yes',23,'Premium'],
  3. ['google','France','yes',23,'Basic'],
  4. ['google','France','yes',23,'Basic'],
  5. ['digg','USA','yes',24,'Basic'],
  6. ['kiwitobes','France','yes',23,'Basic'],
  7. ['google','UK','no',21,'Premium'],
  8. ['(direct)','New Zealand','no',12,'None'],
  9. ['(direct)','UK','no',21,'Basic'],
  10. ['google','USA','no',24,'Premium'],
  11. ['slashdot','France','yes',19,'None'],
  12. ['digg','USA','no',18,'None'],
  13. ['google','UK','no',18,'None'],
  14. ['kiwitobes','UK','no',19,'None'],
  15. ['digg','New Zealand','yes',12,'Basic'],
  16. ['slashdot','UK','no',21,'None'],
  17. ['google','UK','yes',18,'Basic'],
  18. ['kiwitobes','France','yes',19,'Basic']]
  19.  
  20. # my_data=[line.split('\t') for line in file('decision_tree_example.txt')]
  21.  
  22. class decisionnode:
  23. def __init__(self,col=-1,value=None,results=None,tb=None,fb=None):
  24. self.col=col
  25. self.value=value
  26. self.results=results
  27. self.tb=tb
  28. self.fb=fb
  29.  
  30. def sporedi_broj(row,column,value):
  31. return row[column]>=value
  32.  
  33. def sporedi_string(row,column,value):
  34. return row[column]==value
  35.  
  36. # Divides a set on a specific column. Can handle numeric
  37. # or nominal values
  38. def divideset(rows,column,value):
  39. # Make a function that tells us if a row is in
  40. # the first group (true) or the second group (false)
  41. split_function=None
  42. if isinstance(value,int) or isinstance(value,float): # ako vrednosta so koja sporeduvame e od tip int ili float
  43. #split_function=lambda row:row[column]>=value # togas vrati funkcija cij argument e row i vrakja vrednost true ili false
  44. split_function=sporedi_broj
  45. else:
  46. # split_function=lambda row:row[column]==value # ako vrednosta so koja sporeduvame e od drug tip (string)
  47. split_function=sporedi_string
  48.  
  49. # Divide the rows into two sets and return them
  50. # set1=[row for row in rows if split_function(row)] # za sekoj row od rows za koj split_function vrakja true
  51. # set2=[row for row in rows if not split_function(row)] # za sekoj row od rows za koj split_function vrakja false
  52. set1=[row for row in rows if split_function(row,column,value)] # za sekoj row od rows za koj split_function vrakja true
  53. set2=[row for row in rows if not split_function(row,column,value)] # za sekoj row od rows za koj split_function vrakja false
  54. return (set1,set2)
  55.  
  56. # Create counts of possible results (the last column of
  57. # each row is the result)
  58. def uniquecounts(rows):
  59. results={}
  60. for row in rows:
  61. # The result is the last column
  62. r=row[len(row)-1]
  63. if r not in results: results[r]=0
  64. results[r]+=1
  65. return results
  66.  
  67. # Probability that a randomly placed item will
  68. # be in the wrong category
  69. def giniimpurity(rows):
  70. total=len(rows)
  71. counts=uniquecounts(rows)
  72. imp=0
  73. for k1 in counts:
  74. p1=float(counts[k1])/total
  75. for k2 in counts:
  76. if k1==k2: continue
  77. p2=float(counts[k2])/total
  78. imp+=p1*p2
  79. return imp
  80.  
  81.  
  82. # Entropy is the sum of p(x)log(p(x)) across all
  83. # the different possible results
  84. def entropy(rows):
  85. from math import log
  86. log2=lambda x:log(x)/log(2)
  87. results=uniquecounts(rows)
  88. # Now calculate the entropy
  89. ent=0.0
  90. for r in results.keys():
  91. p=float(results[r])/len(rows)
  92. ent=ent-p*log2(p)
  93. return ent
  94.  
  95. def buildtree(rows,scoref=entropy):
  96. if len(rows)==0: return decisionnode()
  97. current_score=scoref(rows)
  98.  
  99. # Set up some variables to track the best criteria
  100. best_gain=0.0
  101. best_criteria=None
  102. best_sets=None
  103.  
  104. column_count=len(rows[0])-1
  105. for col in range(0,column_count):
  106. # Generate the list of different values in
  107. # this column
  108. column_values={}
  109. for row in rows:
  110. column_values[row[col]]=1
  111. print
  112. # Now try dividing the rows up for each value
  113. # in this column
  114. for value in column_values.keys():
  115. (set1,set2)=divideset(rows,col,value)
  116.  
  117. # Information gain
  118. p=float(len(set1))/len(rows)
  119. gain=current_score-p*scoref(set1)-(1-p)*scoref(set2)
  120. if gain>best_gain and len(set1)>0 and len(set2)>0:
  121. best_gain=gain
  122. best_criteria=(col,value)
  123. best_sets=(set1,set2)
  124.  
  125. # Create the subbranches
  126. if best_gain>0:
  127. trueBranch=buildtree(best_sets[0])
  128. falseBranch=buildtree(best_sets[1])
  129. return decisionnode(col=best_criteria[0],value=best_criteria[1],
  130. tb=trueBranch, fb=falseBranch)
  131. else:
  132. return decisionnode(results=uniquecounts(rows))
  133.  
def printtree(tree,indent=''):
    # Pretty-print the tree (Python 2 print statements): leaves show their
    # label-count dict, internal nodes show "col:value? " followed by the
    # T-> and F-> branches, indented one extra space per level.
    # Is this a leaf node?
    if tree.results!=None:
        print str(tree.results)
    else:
        # Print the criteria
        print str(tree.col)+':'+str(tree.value)+'? '
        # Print the branches (trailing comma keeps the cursor on the line)
        print indent+'T->',
        printtree(tree.tb,indent+' ')
        print indent+'F->',
        printtree(tree.fb,indent+' ')
  146.  
  147.  
  148. def classify(observation,tree):
  149. if tree.results!=None:
  150. return tree.results
  151. else:
  152. vrednost=observation[tree.col]
  153. branch=None
  154.  
  155. if isinstance(vrednost,int) or isinstance(vrednost,float):
  156. if vrednost>=tree.value: branch=tree.tb
  157. else: branch=tree.fb
  158. else:
  159. if vrednost==tree.value: branch=tree.tb
  160. else: branch=tree.fb
  161.  
  162. return classify(observation,branch)
  163.  
  164.  
  165. #(s1,s2)=divideset(my_data,2,'yes')
  166. #(sa1,sa2)=divideset(my_data,0,'google')
  167. #(sb1,sb2)=divideset(my_data,1,'USA')
  168.  
  169. #print len(s1),len(s2),uniquecounts(my_data)
  170. #print entropy(my_data),giniimpurity(my_data)
  171. #print entropy(s1),giniimpurity(s1)
  172. #print entropy(s2),giniimpurity(s2)
  173. #t= buildtree(my_data)
  174. # drawtree(t)
  175. #printtree(t)
  176. #for test_case in test_cases:
  177. # print "Nepoznat slucaj:", test_case, " Klasifikacija: ", classify(test_case,t)
  178.  
  179. if __name__ == "__main__":
  180. # referrer='slashdot'
  181. # location='UK'
  182. # readFAQ='no'
  183. # pagesVisited=21
  184. # serviceChosen='Unknown'
  185.  
  186.  
  187. referrer=input()
  188. location=input()
  189. readFAQ=input()
  190. pagesVisited=input()
  191. serviceChosen=input()
  192.  
  193. testCase=[referrer, location, readFAQ, pagesVisited, serviceChosen]
  194.  
  195. t=buildtree(trainingData)
  196. len1=len(trainingData)/2
  197. tt=trainingData
  198.  
  199. t1=[]
  200. while len1!=-1:
  201. t1.append(tt[len1])
  202. len1-=1
  203. len2=len(trainingData)-1
  204. len1=len(trainingData)/2
  205. t2=[]
  206. while len1!=len2:
  207. t2.append(tt[len2])
  208. # print tt[len2]
  209. len2-=1
  210. print t1
  211. print t2
  212. a1=classify(testCase,t1).keys()
  213.  
  214. a2=classify(testCase,t2).keys()
  215.  
  216. a1.sort()
  217. a2.sort()
  218. if a1[0]==a2[0]:
  219. print a1[0]
  220. else:
  221. print "KONTRADIKCIJA"
  222.  
  223. #a=classify(testCase,t).keys()
  224. #a.sort()
  225. #print a[0]
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement