import numpy as np
import pandas as pd
from sklearn import preprocessing

class decisionnode:
    def __init__(self, col=-1, value=None, results=None, tb=None, fb=None):
        self.col = col          # index of the column tested at this node
        self.value = value      # value the column is compared against
        self.results = results  # class counts; only set on leaf nodes
        self.tb = tb            # branch followed when the test is true
        self.fb = fb            # branch followed when the test is false

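# A hedged usage sketch (toy values, not from the dataset): a leaf node
# stores class counts in results, while an internal node stores the split
# column/value and its two branches:
#   leaf = decisionnode(results={0: 3, 1: 1})
#   node = decisionnode(col=2, value=10, tb=leaf, fb=decisionnode(results={1: 4}))
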
# Divides a set on a specific column. Can handle numeric
# or nominal values
def divideset(rows, column, value):
    # Make a function that tells us if a row is in
    # the first group (true) or the second group (false)
    if isinstance(value, (int, float)):
        split_function = lambda row: row[column] >= value
    else:
        split_function = lambda row: row[column] == value

    # Divide the rows into two sets and return them
    set1 = [row for row in rows if split_function(row)]
    set2 = [row for row in rows if not split_function(row)]
    return (set1, set2)

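# Hedged sketch of divideset on toy rows: numeric columns split on >=,
# so splitting on column 2 at value 20 sends the 30-row to set1:
#   divideset([[1, 'a', 10], [2, 'b', 30]], 2, 20)
#   -> ([[2, 'b', 30]], [[1, 'a', 10]])
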
# Create counts of possible results (the last column of
# each row is the result)
def uniquecounts(rows):
    results = {}
    for row in rows:
        # The result is the last column
        r = row[-1]
        if r not in results:
            results[r] = 0
        results[r] += 1
    return results

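# Hedged sketch: uniquecounts tallies the last column of each row, e.g.
#   uniquecounts([[1, 0], [2, 0], [3, 1]]) -> {0: 2, 1: 1}
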
def get_t(train_x, colInstance):
    # Midpoint of the column's range, truncated to an int, used as the
    # candidate split threshold.
    return int((max(train_x[:, colInstance]) + min(train_x[:, colInstance])) / 2)


def get_above_below(train_x, colInstance):
    # Fraction of rows at or below the threshold, and the fraction above it.
    threshold = get_t(train_x, colInstance)
    p_below = sum([train_x[i, colInstance] <= threshold for i in range(len(train_x))])
    p_above = len(train_x) - p_below
    p_below /= len(train_x)
    p_above /= len(train_x)
    return p_below, p_above

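# Hedged worked example on a toy 2-D array: for a column holding
# [1, 2, 9, 10] the midpoint threshold is int((10 + 1) / 2) = 5; two
# values fall at or below it and two above, so:
#   X = np.array([[1], [2], [9], [10]])
#   get_t(X, 0)           -> 5
#   get_above_below(X, 0) -> (0.5, 0.5)
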
def printtree(tree, indent=''):
    # Is this a leaf node?
    if tree.results is not None:
        print(str(tree.results))
    else:
        # Print the criteria
        print(str(tree.col) + ':' + str(tree.value) + '? ')

        # Print the branches
        print(indent + 'T->', end=' ')
        printtree(tree.tb, indent + ' ')
        print(indent + 'F->', end=' ')
        printtree(tree.fb, indent + ' ')

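# Hedged sketch of printtree output for a one-split tree (toy values):
#   0:5?
#   T-> {1: 2}
#   F-> {0: 2}
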
def buildtree(rows):

    if len(rows) == 0: return decisionnode()

    # Set up some variables to track the best criteria. rows is the
    # training DataFrame; split the labels from the features and work on
    # plain numpy arrays so the helper functions above can index them.
    YTrain = rows["Target"].values
    XTrain = rows.drop("Target", axis=1).values

    # If every remaining row carries the same label, return a leaf node.
    if len(set(YTrain)) == 1:
        return decisionnode(results=uniquecounts(rows.values.tolist()))

    split_class = None
    split_class_val = float("inf")
    split_threshold = None

    for colInstance in range(XTrain.shape[1]):
        threshold = get_t(XTrain, colInstance)
        p_below, p_above = get_above_below(XTrain, colInstance)

        # Now that we have the threshold value, count, per class label,
        # how many rows fall on each side of it, and see which column's
        # summation is lesser.
        below_t = {}
        above_t = {}
        for i in range(len(XTrain)):
            if XTrain[i, colInstance] <= threshold:
                below_t[YTrain[i]] = below_t.get(YTrain[i], 0) + 1
            else:
                above_t[YTrain[i]] = above_t.get(YTrain[i], 0) + 1

        # For each candidate column we replace entropy with the condition
        # computed by the functions above: instead of the information-gain
        # factor, we use the probabilistic method explained in the
        # pseudocode, i.e. the minority-class count on each side of the
        # split, weighted by the probability mass on that side.
        below_min = min(below_t.values()) if len(below_t) > 1 else 0
        above_min = min(above_t.values()) if len(above_t) > 1 else 0
        score = p_below * below_min + p_above * above_min
        if score < split_class_val:
            split_class_val = score
            split_class = colInstance
            split_threshold = threshold

    # split_class is the split-class identifier the pseudocode returns.
    # The pseudocode stops there; recursing on both halves of the best
    # split is one reasonable completion so main() gets a full tree.
    feature = rows.drop("Target", axis=1).columns[split_class]
    true_rows = rows[rows[feature] > split_threshold]
    false_rows = rows[rows[feature] <= split_threshold]
    if len(true_rows) == 0 or len(false_rows) == 0:
        return decisionnode(results=uniquecounts(rows.values.tolist()))
    return decisionnode(col=split_class, value=split_threshold,
                        tb=buildtree(true_rows), fb=buildtree(false_rows))

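# Hedged smoke test on a hypothetical toy frame (not Training.csv):
#   df = pd.DataFrame({'f1': [1, 1, 8, 9], 'Target': [0, 0, 1, 1]})
#   buildtree(df)
# splits f1 at the midpoint threshold 5 and returns a node whose True
# branch is the pure leaf {1: 2} and whose False branch is {0: 2}.
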
def encode_target(df, target_column):
    """Add column to df with integers for the target.

    Args
    ----
    df -- pandas DataFrame.
    target_column -- column to map to int, producing
        new Target column.

    Returns
    -------
    df_mod -- modified DataFrame.
    targets -- list of target names.
    """
    df_mod = df.copy()
    targets = df_mod[target_column].unique()
    map_to_int = {name: n for n, name in enumerate(targets)}
    df_mod["Target"] = df_mod[target_column].replace(map_to_int)
    return (df_mod, targets)

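# Hedged sketch on a toy frame:
#   encode_target(pd.DataFrame({'y': ['a', 'b', 'a']}), 'y')
# adds Target = [0, 1, 0] and returns targets = array(['a', 'b'], dtype=object).
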
def main():
    data = pd.read_csv('Training.csv', sep=',', header=None)
    # Shuffle data.
    # data = data.iloc[np.random.permutation(len(data))]
    # Promote the first row to column headers.
    headers = data.iloc[0]
    data = data[1:]
    data = data.rename(columns=headers)
    # Eliminate invalid data: fill missing pack-years with the column mean.
    data['Smoking_Pack-Years'] = data['Smoking_Pack-Years'].astype(float)
    data['Smoking_Pack-Years'] = data['Smoking_Pack-Years'].fillna(data['Smoking_Pack-Years'].mean())
    data['Pathological_grade'] = data['Pathological_grade'].astype(str)
    # data = data.replace(['?'], [np.nan])
    prep = preprocessing.LabelEncoder()
    # print(data.dtypes)
    # Encode every column as integers so the tree only has to compare numbers.
    for cols in data.columns:
        # print(cols)
        data[cols] = prep.fit_transform(data[cols])
    (data, targets) = encode_target(data, "KM_Overall_survival_censor")
    # print(data)
    # Now we remove the original label column from the table.
    data = data.drop("KM_Overall_survival_censor", axis=1)
    # print(data)
    Train = data[:140]
    Test = data[140:]
    Train = Train.sample(frac=1)

    # buildtree expects the DataFrame itself (it looks up the "Target"
    # column by name), not a plain list of rows.
    tree = buildtree(Train)

    # printtree walks leaves and branches safely, unlike hard-coded
    # tree.tb.tb accesses, which crash with an AttributeError when the
    # tree is shallower than expected.
    printtree(tree)


if __name__ == '__main__':
    main()