Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
def format_data(df, min_correlation=0.2, test_size=0.25, random_state=42):
    """Build correlation-filtered train/test splits for predicting ``G3``.

    The final grade column ``G3`` is used as the target. The ``school``,
    ``G1``, ``G2`` and ``G3`` columns are removed from the features, the
    remaining categorical columns are one-hot encoded, and only the features
    whose absolute correlation with the target reaches ``min_correlation``
    are kept.

    NOTE(review): ``pd`` and ``train_test_split`` must already be in scope;
    the latter is presumably scikit-learn's -- confirm against the file's
    imports.

    Args:
        df: DataFrame containing at least the columns ``school``, ``G1``,
            ``G2`` and ``G3``. It is not modified.
        min_correlation: Minimum absolute correlation with the target for a
            feature to be kept.
        test_size: Passed to ``train_test_split`` as ``test_size``.
        random_state: Passed to ``train_test_split`` as ``random_state``.

    Returns:
        The tuple ``(X_train, X_test, y_train, y_test)``.

    Raises:
        KeyError: If any of ``school``, ``G1``, ``G2`` or ``G3`` is missing.
    """
    # Targets are the final grade of each student
    labels = df['G3']

    # Drop the school and the grades from the features
    features = df.drop(columns=['school', 'G1', 'G2', 'G3'])

    # One-hot encoding of categorical variables
    features = pd.get_dummies(features)

    # list() attaches the target positionally rather than aligning on index
    features['y'] = list(labels)
    most_correlated = features.corr().abs()['y'].sort_values(ascending=False)

    # Remove the target itself by label: a positional [1:] slice would keep
    # 'y' as a feature whenever another column ties with it at |r| == 1.
    most_correlated = most_correlated.drop('y')

    # Keep correlations at or above the threshold in absolute value
    most_correlated = most_correlated[most_correlated >= min_correlation]

    # .loc replaces DataFrame.ix, which was removed in pandas 1.0
    features = features.loc[:, most_correlated.index]

    # Higher education is already encoded in `higher_yes`; `higher_no` may
    # not have survived the correlation filter, so tolerate its absence.
    features = features.drop(columns='higher_no', errors='ignore')

    # Split into training/testing sets
    X_train, X_test, y_train, y_test = train_test_split(
        features,
        labels,
        test_size=test_size,
        random_state=random_state,
    )

    # Return the training and testing data
    return X_train, X_test, y_train, y_test
Add Comment
Please, Sign In to add comment