import numpy as np
import pandas as pd
from sklearn import preprocessing

class decisionnode:
    def __init__(self, col=-1, value=None, results=None, tb=None, fb=None):
        self.col = col          # index of the column tested at this node
        self.value = value      # value the column is compared against
        self.results = results  # class counts; only set on leaf nodes
        self.tb = tb            # branch followed when the test is true
        self.fb = fb            # branch followed when the test is false

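# A hedged usage sketch (toy values, not from the dataset): a leaf node
# stores class counts in results, while an internal node stores the split
# column/value and its two branches:
#   leaf = decisionnode(results={0: 3, 1: 1})
#   node = decisionnode(col=2, value=10, tb=leaf, fb=decisionnode(results={1: 4}))
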
# Divides a set on a specific column. Can handle numeric
# or nominal values
def divideset(rows, column, value):
    # Make a function that tells us if a row is in
    # the first group (true) or the second group (false)
    if isinstance(value, (int, float)):
        split_function = lambda row: row[column] >= value
    else:
        split_function = lambda row: row[column] == value

    # Divide the rows into two sets and return them
    set1 = [row for row in rows if split_function(row)]
    set2 = [row for row in rows if not split_function(row)]
    return (set1, set2)

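# Hedged sketch of divideset on toy rows: numeric columns split on >=,
# so splitting on column 2 at value 20 sends the 30-row to set1:
#   divideset([[1, 'a', 10], [2, 'b', 30]], 2, 20)
#   -> ([[2, 'b', 30]], [[1, 'a', 10]])
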
# Create counts of possible results (the last column of
# each row is the result)
def uniquecounts(rows):
    results = {}
    for row in rows:
        # The result is the last column
        r = row[-1]
        if r not in results:
            results[r] = 0
        results[r] += 1
    return results

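# Hedged sketch: uniquecounts tallies the last column of each row, e.g.
#   uniquecounts([[1, 0], [2, 0], [3, 1]]) -> {0: 2, 1: 1}
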
def get_t(train_x, colInstance):
    # Midpoint of the column's range, truncated to an int, used as the
    # candidate split threshold.
    return int((max(train_x[:, colInstance]) + min(train_x[:, colInstance])) / 2)


def get_above_below(train_x, colInstance):
    # Fraction of rows at or below the threshold, and the fraction above it.
    threshold = get_t(train_x, colInstance)
    p_below = sum([train_x[i, colInstance] <= threshold for i in range(len(train_x))])
    p_above = len(train_x) - p_below
    p_below /= len(train_x)
    p_above /= len(train_x)
    return p_below, p_above

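# Hedged worked example on a toy 2-D array: for a column holding
# [1, 2, 9, 10] the midpoint threshold is int((10 + 1) / 2) = 5; two
# values fall at or below it and two above, so:
#   X = np.array([[1], [2], [9], [10]])
#   get_t(X, 0)           -> 5
#   get_above_below(X, 0) -> (0.5, 0.5)
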
def printtree(tree, indent=''):
    # Is this a leaf node?
    if tree.results is not None:
        print(str(tree.results))
    else:
        # Print the criteria
        print(str(tree.col) + ':' + str(tree.value) + '? ')

        # Print the branches
        print(indent + 'T->', end=' ')
        printtree(tree.tb, indent + ' ')
        print(indent + 'F->', end=' ')
        printtree(tree.fb, indent + ' ')

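# Hedged sketch of printtree output for a one-split tree (toy values):
#   0:5?
#   T-> {1: 2}
#   F-> {0: 2}
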
def buildtree(rows):

    if len(rows) == 0: return decisionnode()

    # Set up some variables to track the best criteria. rows is the
    # training DataFrame; split the labels from the features and work on
    # plain numpy arrays so the helper functions above can index them.
    YTrain = rows["Target"].values
    XTrain = rows.drop("Target", axis=1).values

    # If every remaining row carries the same label, return a leaf node.
    if len(set(YTrain)) == 1:
        return decisionnode(results=uniquecounts(rows.values.tolist()))

    split_class = None
    split_class_val = float("inf")
    split_threshold = None

    for colInstance in range(XTrain.shape[1]):
        threshold = get_t(XTrain, colInstance)
        p_below, p_above = get_above_below(XTrain, colInstance)

        # Now that we have the threshold value, count, per class label,
        # how many rows fall on each side of it, and see which column's
        # summation is lesser.
        below_t = {}
        above_t = {}
        for i in range(len(XTrain)):
            if XTrain[i, colInstance] <= threshold:
                below_t[YTrain[i]] = below_t.get(YTrain[i], 0) + 1
            else:
                above_t[YTrain[i]] = above_t.get(YTrain[i], 0) + 1

        # For each candidate column we replace entropy with the condition
        # computed by the functions above: instead of the information-gain
        # factor, we use the probabilistic method explained in the
        # pseudocode, i.e. the minority-class count on each side of the
        # split, weighted by the probability mass on that side.
        below_min = min(below_t.values()) if len(below_t) > 1 else 0
        above_min = min(above_t.values()) if len(above_t) > 1 else 0
        score = p_below * below_min + p_above * above_min
        if score < split_class_val:
            split_class_val = score
            split_class = colInstance
            split_threshold = threshold

    # split_class is the split-class identifier the pseudocode returns.
    # The pseudocode stops there; recursing on both halves of the best
    # split is one reasonable completion so main() gets a full tree.
    feature = rows.drop("Target", axis=1).columns[split_class]
    true_rows = rows[rows[feature] > split_threshold]
    false_rows = rows[rows[feature] <= split_threshold]
    if len(true_rows) == 0 or len(false_rows) == 0:
        return decisionnode(results=uniquecounts(rows.values.tolist()))
    return decisionnode(col=split_class, value=split_threshold,
                        tb=buildtree(true_rows), fb=buildtree(false_rows))

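# Hedged smoke test on a hypothetical toy frame (not Training.csv):
#   df = pd.DataFrame({'f1': [1, 1, 8, 9], 'Target': [0, 0, 1, 1]})
#   buildtree(df)
# splits f1 at the midpoint threshold 5 and returns a node whose True
# branch is the pure leaf {1: 2} and whose False branch is {0: 2}.
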
def encode_target(df, target_column):
    """Add column to df with integers for the target.

    Args
    ----
    df -- pandas DataFrame.
    target_column -- column to map to int, producing
        new Target column.

    Returns
    -------
    df_mod -- modified DataFrame.
    targets -- list of target names.
    """
    df_mod = df.copy()
    targets = df_mod[target_column].unique()
    map_to_int = {name: n for n, name in enumerate(targets)}
    df_mod["Target"] = df_mod[target_column].replace(map_to_int)
    return (df_mod, targets)

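# Hedged sketch on a toy frame:
#   encode_target(pd.DataFrame({'y': ['a', 'b', 'a']}), 'y')
# adds Target = [0, 1, 0] and returns targets = array(['a', 'b'], dtype=object).
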
def main():
    data = pd.read_csv('Training.csv', sep=',', header=None)
    # Shuffle data.
    # data = data.iloc[np.random.permutation(len(data))]
    # Promote the first row to column headers.
    headers = data.iloc[0]
    data = data[1:]
    data = data.rename(columns=headers)
    # Eliminate invalid data: fill missing pack-years with the column mean.
    data['Smoking_Pack-Years'] = data['Smoking_Pack-Years'].astype(float)
    data['Smoking_Pack-Years'] = data['Smoking_Pack-Years'].fillna(data['Smoking_Pack-Years'].mean())
    data['Pathological_grade'] = data['Pathological_grade'].astype(str)
    # data = data.replace(['?'], [np.nan])
    prep = preprocessing.LabelEncoder()
    # print(data.dtypes)
    # Encode every column as integers so the tree only has to compare numbers.
    for cols in data.columns:
        # print(cols)
        data[cols] = prep.fit_transform(data[cols])
    (data, targets) = encode_target(data, "KM_Overall_survival_censor")
    # print(data)
    # Now we remove the original label column from the table.
    data = data.drop("KM_Overall_survival_censor", axis=1)
    # print(data)
    Train = data[:140]
    Test = data[140:]
    Train = Train.sample(frac=1)

    # buildtree expects the DataFrame itself (it looks up the "Target"
    # column by name), not a plain list of rows.
    tree = buildtree(Train)

    # printtree walks leaves and branches safely, unlike hard-coded
    # tree.tb.tb accesses, which crash with an AttributeError when the
    # tree is shallower than expected.
    printtree(tree)


if __name__ == '__main__':
    main()