Guest User

Untitled

a guest
Apr 19th, 2018
53
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 1.01 KB | None | 0 0
  1. def format_data(df):
  2. # Targets are final grade of student
  3. labels = df['G3']
  4. # Drop the school and the grades from features
  5. df = df.drop(columns=['school', 'G1', 'G2', 'G3'])
  6.  
  7. # One-Hot Encoding of Categorical Variables
  8. df = pd.get_dummies(df)
  9.  
  10. df['y'] = list(labels)
  11.  
  12. most_correlated = df.corr().abs()['y'].sort_values(ascending=False)
  13.  
  14. # Keep correlations greater than 0.2 in absolute value
  15. most_correlated = most_correlated[most_correlated >= 0.2][1:]
  16.  
  17. df = df.ix[:, most_correlated.index]
  18.  
  19. # Already encode the higher education column in `higher_yes`
  20. df = df.drop(columns = 'higher_no')
  21.  
  22. # Split into training/testing sets with 25% split
  23. X_train, X_test, y_train, y_test = train_test_split(df, labels,
  24. test_size = 0.25,
  25. random_state=42)
  26.  
  27. # Return the training and testing data
  28. return X_train, X_test, y_train, y_test
Add Comment
Please, Sign In to add comment