Advertisement
Guest User

Untitled

a guest
Oct 25th, 2016
74
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 8.99 KB | None | 0 0
  1. trainingData=[['slashdot','USA','yes',18,'None'],
  2. ['google','France','yes',23,'Premium'],
  3. ['google','France','yes',23,'Basic'],
  4. ['google','France','yes',23,'Basic'],
  5. ['digg','USA','yes',24,'Basic'],
  6. ['kiwitobes','France','yes',23,'Basic'],
  7. ['google','UK','no',21,'Premium'],
  8. ['(direct)','New Zealand','no',12,'None'],
  9. ['(direct)','UK','no',21,'Basic'],
  10. ['google','USA','no',24,'Premium'],
  11. ['slashdot','France','yes',19,'None'],
  12. ['digg','USA','no',18,'None'],
  13. ['google','UK','no',18,'None'],
  14. ['kiwitobes','UK','no',19,'None'],
  15. ['digg','New Zealand','yes',12,'Basic'],
  16. ['slashdot','UK','no',21,'None'],
  17. ['google','UK','yes',18,'Basic'],
  18. ['kiwitobes','France','yes',19,'Basic']]
  19.  
  20. class decisionnode:
  21. def __init__(self,col=-1,value=None,results=None,tb=None,fb=None,nivo=None):
  22. self.col=col
  23. self.value=value
  24. self.results=results
  25. self.tb=tb
  26. self.fb=fb
  27. self.nivo=nivo
  28.  
  29. def sporedi_broj(row,column,value):
  30. return row[column]>=value
  31.  
  32. def sporedi_string(row,column,value):
  33. return row[column]==value
  34.  
  35.  
  36. def divideset(rows,column,value):
  37. split_function=None
  38. if isinstance(value,int) or isinstance(value,float):
  39. split_function=sporedi_broj
  40. else:
  41. # split_function=lambda row:row[column]==value # ako vrednosta so koja sporeduvame e od drug tip (string)
  42. split_function=sporedi_string
  43.  
  44. # Divide the rows into two sets and return them
  45. # set1=[row for row in rows if split_function(row)] # za sekoj row od rows za koj split_function vrakja true
  46. # set2=[row for row in rows if not split_function(row)] # za sekoj row od rows za koj split_function vrakja false
  47. set1=[row for row in rows if split_function(row,column,value)] # za sekoj row od rows za koj split_function vrakja true
  48. set2=[row for row in rows if not split_function(row,column,value)] # za sekoj row od rows za koj split_function vrakja false
  49.  
  50. return (set1,set2)
  51.  
  52. # Divides a set on a specific column. Can handle numeric
  53. # or nominal values
  54. def divideset2(rows,column,value):
  55. # Make a function that tells us if a row is in
  56. # the first group (true) or the second group (false)
  57. split_function=None
  58. if isinstance(value,int) or isinstance(value,float): # ako vrednosta so koja sporeduvame e od tip int ili float
  59. #split_function=lambda row:row[column]>=value # togas vrati funkcija cij argument e row i vrakja vrednost true ili false
  60. split_function=sporedi_broj
  61. else:
  62. # split_function=lambda row:row[column]==value # ako vrednosta so koja sporeduvame e od drug tip (string)
  63. split_function=sporedi_string
  64.  
  65. # Divide the rows into two sets and return them
  66. # set1=[row for row in rows if split_function(row)] # za sekoj row od rows za koj split_function vrakja true
  67. # set2=[row for row in rows if not split_function(row)] # za sekoj row od rows za koj split_function vrakja false
  68. set1=[]
  69. set2=[]
  70. for row in rows:
  71. if split_function(row,column,value):
  72. set1.append(row)
  73. else:
  74. set2.append(row)
  75. return (set1,set2)
  76.  
  77.  
  78. # Create counts of possible results (the last column of
  79. # each row is the result)
  80. def uniquecounts(rows):
  81. results={}
  82. for row in rows:
  83. # The result is the last column
  84. r=row[len(row)-1]
  85. if r not in results: results[r]=0
  86. results[r]+=1
  87. return results
  88.  
  89. def uniquecounts2(rows):
  90. results={}
  91. for row in rows:
  92. # The result is the last column
  93. r=row[-1]
  94. results.setdefault(r,0)
  95. results[r]+=1
  96. return results
  97.  
  98.  
  99.  
  100. def entropy(rows):
  101. from math import log
  102. log2=lambda x:log(x)/log(2)
  103. results=uniquecounts(rows)
  104. ent=0.0
  105. for r in results.keys():
  106. # print r,results[r]
  107. p=float(results[r])/len(rows)
  108. ent=ent-p*log2(p)
  109. return ent
  110.  
  111. def entropy2(rows):
  112. from math import log2
  113. results=uniquecounts(rows)
  114. # Now calculate the entropy
  115. ent=0.0
  116. for r in results.keys():
  117. p=float(results[r])/len(rows)
  118. ent=ent-p*log2(p)
  119. return ent
  120.  
  121.  
  122. def buildtree(rows, scoref=entropy,nivo=0):
  123. if len(rows)==0: return decisionnode()
  124. current_score=scoref(rows)
  125.  
  126. best_gain=0.0
  127. best_criteria=None
  128. best_sets=None
  129.  
  130. column_count=len(rows[0])-1
  131.  
  132. for col in range(0,column_count):
  133. #global column_values
  134. # Generate the list of different values in
  135. # this column
  136. column_values={}
  137. for row in rows:
  138. column_values[row[col]]=1
  139.  
  140.  
  141. for value in column_values.keys():
  142. (set1,set2)=divideset(rows,col,value)
  143.  
  144. # Information gain
  145. p=float(len(set1))/len(rows)
  146. gain=current_score-p*scoref(set1)-(1-p)*scoref(set2)
  147. if gain>best_gain and len(set1)>0 and len(set2)>0:
  148. best_gain=gain
  149. best_criteria=(col,value)
  150. best_sets=(set1,set2)
  151.  
  152. if best_gain>0:
  153. trueBranch=buildtree(best_sets[0],nivo=nivo+1)
  154. falseBranch=buildtree(best_sets[1],nivo=nivo+1)
  155. return decisionnode(col=best_criteria[0],value=best_criteria[1],
  156. tb=trueBranch, fb=falseBranch,nivo=nivo) #tekovno nivo i ne treba +1
  157. else:
  158. return decisionnode(results=uniquecounts(rows),nivo=nivo)
  159.  
  160. def buildtree2(rows, scoref=entropy):
  161. if len(rows)==0: return decisionnode()
  162. current_score=scoref(rows)
  163.  
  164. # Set up some variables to track the best criteria
  165. best_gain=0.0
  166. best_col=None
  167. best_value=None
  168. best_ts=None
  169. best_fs=None
  170.  
  171. column_count=len(rows[0])-1
  172. for col in range(0,column_count):
  173. # Generate the list of different values in
  174. # this column
  175. column_values={}
  176. for row in rows:
  177. column_values[row[col]]=1
  178. # print row[col]
  179. # print
  180. # print col,column_values.keys()
  181. # Now try dividing the rows up for each value
  182. # in this column
  183. for value in column_values.keys():
  184. (set1,set2)=divideset(rows,col,value)
  185.  
  186. # Information gain
  187. p=float(len(set1))/len(rows)
  188. gain=current_score-p*scoref(set1)-(1-p)*scoref(set2)
  189. print col, value, gain, len(set1), len(set2)
  190. # print set1, set2, gain
  191. if gain>best_gain and len(set1)>0 and len(set2)>0:
  192. best_gain=gain
  193. best_col=col
  194. best_value=value
  195. best_ts=set1
  196. best_fs=set2
  197. best_criteria=(col,value)
  198. best_sets=(set1,set2)
  199. print
  200.  
  201. # return
  202. print best_gain, best_col, best_value
  203. # Create the subbranches
  204. if best_gain>0:
  205. trueBranch=buildtree(best_sets[0])
  206. falseBranch=buildtree(best_sets[1])
  207. return decisionnode(col=best_criteria[0],value=best_criteria[1],
  208. tb=trueBranch, fb=falseBranch)
  209. trueBranch=buildtree(best_ts)
  210. falseBranch=buildtree(best_fs)
  211. return decisionnode(col=best_col,value=best_value,
  212. tb=trueBranch, fb=falseBranch)
  213. else:
  214. return decisionnode(results=uniquecounts(rows))
  215.  
  216. def printtree(tree,indent=''):
  217. if tree.results!=None:
  218. print str(tree.results)
  219. else:
  220. print str(tree.col)+':'+str(tree.value)+'? Level='+str(tree.nivo)
  221. print indent+'T->',
  222. printtree(tree.tb,indent+' ')
  223. print indent+'F->',
  224. printtree(tree.fb,indent+' ')
  225.  
  226.  
  227. #t=buildtree2(my_data)
  228.  
  229. def classify(observation,tree):
  230. if tree.results!=None:
  231. return tree.results
  232. else:
  233. vrednost=observation[tree.col]
  234. branch=None
  235.  
  236. if isinstance(vrednost,int) or isinstance(vrednost,float):
  237. if vrednost>=tree.value: branch=tree.tb
  238. else: branch=tree.fb
  239. else:
  240. if vrednost==tree.value: branch=tree.tb
  241. else: branch=tree.fb
  242.  
  243. return classify(observation,branch)
  244.  
  245.  
  246. if __name__ == "__main__":
  247. # referrer='slashdot'
  248. # location='US'
  249. # readFAQ='no'
  250. # pagesVisited=19
  251. # serviceChosen='None'
  252.  
  253. referrer=input()
  254. location=input()
  255. readFAQ=input()
  256. pagesVisited=input()
  257. serviceChosen=input()
  258.  
  259. testCase=[referrer, location, readFAQ, pagesVisited, serviceChosen]
  260. trainingData.append(testCase)
  261. t=buildtree(trainingData)
  262. printtree(t)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement