Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import numpy as np
- import pandas as pd
- from sklearn import preprocessing
class decisionnode:
    """A node in a binary decision tree.

    Leaf nodes carry a non-None `results` dict; internal nodes carry
    the split criterion (`col`, `value`) and two child branches.
    """

    def __init__(self, col=-1, value=None, results=None, tb=None, fb=None):
        # Index of the column tested at this node (-1 = unset).
        self.col = col
        # Threshold (numeric) or category (nominal) compared against.
        self.value = value
        # Outcome -> count mapping; set only on leaf nodes.
        self.results = results
        # Branch taken when the split test is true.
        self.tb = tb
        # Branch taken when the split test is false.
        self.fb = fb
# Divides a set on a specific column. Can handle numeric
# or nominal values
def divideset(rows, column, value):
    """Split `rows` into two lists on `column`.

    Numeric `value`: rows with row[column] >= value go to set1.
    Nominal `value`: rows with row[column] == value go to set1.
    All remaining rows go to set2; input order is preserved.

    Returns (set1, set2).
    """
    # Choose the predicate once, outside the row loop.
    # NOTE: bool is a subclass of int, so a bool `value` takes the
    # numeric (>=) branch -- same as the original behavior.
    if isinstance(value, (int, float)):
        split_function = lambda row: row[column] >= value
    else:
        split_function = lambda row: row[column] == value
    # Single pass over `rows` (the original iterated twice).
    set1, set2 = [], []
    for row in rows:
        (set1 if split_function(row) else set2).append(row)
    return (set1, set2)
# Create counts of possible results (the last column of
# each row is the result)
def uniquecounts(rows):
    """Count occurrences of each outcome in `rows`.

    The outcome is the last column of each row.  Returns a dict
    mapping outcome -> count (empty dict for empty input).
    """
    results = {}
    for row in rows:
        # row[-1] replaces the clumsier row[len(row) - 1];
        # dict.get collapses the membership-test-then-increment.
        r = row[-1]
        results[r] = results.get(r, 0) + 1
    return results
def get_t(train_x, colInstance):
    """Return the midpoint threshold of column `colInstance`.

    `train_x` must support 2-D `[:, col]` indexing (e.g. a numpy
    array).  The midpoint (max + min) / 2 is truncated to an int.
    """
    column = train_x[:, colInstance]
    midpoint = (max(column) + min(column)) / 2
    return int(midpoint)
def get_above_below(train_x, colInstance):
    """Return (p_below, p_above) for column `colInstance` of `train_x`.

    p_below is the fraction of rows whose value is at or below the
    column's midpoint threshold; p_above is the complement.  The
    threshold int((max + min) / 2) is the same quantity get_t
    computes, inlined here so the function stands alone.
    """
    column = train_x[:, colInstance]
    threshold = int((max(column) + min(column)) / 2)
    total = len(train_x)
    # Count rows at or below the threshold with a plain loop
    # (equivalent to summing the boolean comparisons).
    below = 0
    for i in range(total):
        if train_x[i, colInstance] <= threshold:
            below += 1
    return below / total, (total - below) / total
def printtree(tree, indent=''):
    """Recursively pretty-print a decision tree to stdout.

    Leaves print their results dict; internal nodes print the split
    criterion ("col:value? ") followed by the true (T->) and false
    (F->) branches, each printed one indent level deeper.
    """
    # Is this a leaf node?  Use identity comparison with None
    # (`is not None`) instead of the original `!= None`.
    if tree.results is not None:
        print(str(tree.results))
    else:
        # Print the criteria for this split.
        print(str(tree.col) + ':' + str(tree.value) + '? ')
        # Print the branches, one indent level deeper.
        print(indent + 'T->', end=' ')
        printtree(tree.tb, indent + ' ')
        print(indent + 'F->', end=' ')
        printtree(tree.fb, indent + ' ')
def buildtree(rows):
    """Build a decision tree from `rows` (unfinished -- see WARNING).

    WARNING(review): this function is incomplete.  The inner loop
    references `train_x`, `train_y`, `threshold`, `below_t`, and
    `above_t`, none of which are defined in this scope, so any
    non-empty input raises NameError.  The split-selection logic is
    still pseudocode in the comments below the loop.
    """
    if len(rows)==0: return decisionnode()
    # Set up some variables to track the best criteria
    # NOTE(review): rows["Target"] / rows.drop(...) require a pandas
    # DataFrame, but main() passes Train.values.tolist() (a plain
    # list of lists) -- confirm which input type was intended.
    YTrain = rows["Target"]
    XTrain = rows.drop("Target",1)
    for colInstance in range(len(XTrain[0])):
        t = get_t(XTrain, colInstance)
        a , b = get_above_below(XTrain, colInstance)
        # Now that we have the threshold values, we need to compare with the summation and see if it is lesser.
        # NOTE(review): `train_x`, `train_y`, `threshold`, `below_t`,
        # `above_t` are undefined here -- presumably XTrain, YTrain,
        # `t`, and two per-class counters were meant.  TODO: confirm
        # and finish before calling this function.
        for i in range(len(train_x)):
            if XTrain[i, colInstance] <= threshold: below_t[train_y[i]] += 1
            else: above_t[train_y[i]] += 1
    # For each class row, we need to check the splitting class and replace entropy with the given condition (function named above)
    # Also, instead of the Information Gain factor, we will have the probabilistic method given, which has been explained in pseudocode.
    # if below_t[0] > below_t[1]: below_t = below_t[1]
    # else: below_t = below_t[0]
    # if above_t[0] > above_t[1]: above_t = above_t[1]
    # else: above_t = above_t[0]
    # if below*below_t + above*above_t < split_class_val:
    #     split_class_val = below*below_t + above*above_t
    #     split_class = curr_col
    # return split_class
    # This should work for split class identifier.
def encode_target(df, target_column):
    """Add column to df with integers for the target.

    Args
    ----
    df -- pandas DataFrame.
    target_column -- column to map to int, producing
                     new Target column.

    Returns
    -------
    df_mod -- modified DataFrame (a copy; `df` is untouched).
    targets -- list of target names, in order of first appearance.
    """
    df_mod = df.copy()
    targets = df_mod[target_column].unique()
    # Assign each distinct target value an integer in order of first
    # appearance, then materialise the mapping as a new column.
    codes = {}
    for idx, name in enumerate(targets):
        codes[name] = idx
    df_mod["Target"] = df_mod[target_column].replace(codes)
    return df_mod, targets
def main():
    """Load Training.csv, clean and label-encode it, then build a tree.

    NOTE(review): depends on a local 'Training.csv' file, on sklearn's
    LabelEncoder, and on buildtree(), which is unfinished (see its
    docstring) -- the tree-walking prints at the end will fail until
    buildtree returns a populated decisionnode.
    """
    data = pd.read_csv('Training.csv', sep=',', header=None)
    # Shuffle data.
    # data = data.iloc[np.random.permutation(len(data))]
    # Eliminate invalid data.
    # The CSV is read with header=None, so row 0 holds the real column
    # names: promote it to the header and drop it from the data.
    headers = data.iloc[0]
    data = data[1:]
    data = data.rename(columns = headers)
    # Impute missing pack-years with the column mean.
    data['Smoking_Pack-Years'] = data['Smoking_Pack-Years'].astype(float)
    data['Smoking_Pack-Years'].fillna((data['Smoking_Pack-Years'].mean()), inplace=True)
    data['Pathological_grade'] = data['Pathological_grade'].astype(str)
    #data = data.replace(['?'], [np.nan])
    # Label-encode every column to integers.  A single LabelEncoder is
    # refit per column; only the transformed values are kept.
    prep = preprocessing.LabelEncoder()
    # print( data.dtypes)
    for cols in data.columns:
        # print (cols)
        data[cols]=prep.fit_transform(data[cols])
    (data,targets) = encode_target(data, "KM_Overall_survival_censor")
    # print(data)
    # Now we remove the column from the initial table.
    # NOTE(review): positional axis in drop(col, 1) is deprecated in
    # modern pandas (use axis=1) -- confirm the pinned pandas version.
    data = data.drop("KM_Overall_survival_censor",1)
    # print(data)
    # First 140 rows train, the rest test; shuffle the training split.
    Train = data[:140]
    Test = data[140:]
    Train = Train.sample(frac=1)
    # Convert the numpy data structure.
    tree=buildtree(Train.values.tolist())
    # Manually walk and print the first two levels of the tree.
    print((tree.col))
    print((tree.value))
    print((tree.results))
    print("")
    print((tree.tb.col))
    print((tree.tb.value))
    print((tree.tb.results))
    print("")
    print((tree.tb.tb.col))
    print((tree.tb.tb.value))
    print((tree.tb.tb.results))
    print("")
    print((tree.tb.fb.col))
    print((tree.tb.fb.value))
    print((tree.tb.fb.results))
main()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement