Untitled

def format_data(df, *, corr_threshold=0.2, test_size=0.25, random_state=42):
    """Select the features most correlated with ``G3`` and split the data.

    The ``school``, ``G1``, ``G2`` and ``G3`` columns are removed from the
    features, the remaining categorical columns are one-hot encoded, and only
    the columns whose absolute correlation with ``G3`` is at least
    ``corr_threshold`` are kept (ordered from most to least correlated).
    The ``higher_no`` dummy column is dropped when present.

    Args:
        df: DataFrame containing at least the ``school``, ``G1``, ``G2`` and
            ``G3`` columns. It is not modified.
        corr_threshold: Minimum absolute correlation with the target for a
            feature to be kept.
        test_size: Passed to ``train_test_split`` as ``test_size``.
        random_state: Passed to ``train_test_split`` as ``random_state``.

    Returns:
        The ``(X_train, X_test, y_train, y_test)`` tuple produced by
        ``train_test_split``.

    Raises:
        KeyError: If any of ``school``, ``G1``, ``G2`` or ``G3`` is missing.
    """
    # Targets are the final grade of the student
    labels = df['G3']
    # Drop the school and the grades from the features
    features = df.drop(columns=['school', 'G1', 'G2', 'G3'])

    # One-hot encoding of categorical variables
    features = pd.get_dummies(features)

    # Attach the target by position (not by index alignment) so that its
    # correlation with every feature can be read from one column.
    features['y'] = list(labels)

    correlations = features.corr().abs()['y'].sort_values(ascending=False)

    # Remove the target's self-correlation by label rather than by position:
    # a perfectly correlated feature could otherwise sort ahead of 'y'.
    correlations = correlations.drop('y')

    # Keep correlations at or above the threshold in absolute value
    selected = correlations[correlations >= corr_threshold]

    # `.ix` was removed in pandas 1.0; `.loc` is the label-based equivalent.
    features = features.loc[:, selected.index]

    # Higher education is already encoded in `higher_yes`; the complementary
    # dummy may not have passed the filter, so do not fail if it is absent.
    features = features.drop(columns='higher_no', errors='ignore')

    # Split into training/testing sets
    X_train, X_test, y_train, y_test = train_test_split(
        features,
        labels,
        test_size=test_size,
        random_state=random_state,
    )

    # Return the training and testing data
    return X_train, X_test, y_train, y_test