Advertisement
Guest User

Untitled

a guest
Jan 28th, 2020
116
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 3.75 KB | None | 0 0
  1. # -*- coding: utf-8 -*-
  2. """
  3. Created on Wed Jan 22 11:54:55 2020
  4.  
  5. @author: abrown191
  6. """
  7. #Data preprocessing
  8. #Importing the Libraries
  9.  
  10. import numpy as np #Importing of Library and Adding and Alias
  11. import matplotlib.pyplot as plt #as plt!
  12. import pandas as pd
  13.  
  14. #Importing the Dataset
  15. #SET A WORKING DIRECTORY FOLDER!!
  16. # file explorer - machine learning a-z - part 1 - Section2etc
  17. # PRESS F5 to set directory
  18. dataset = pd.read_csv('Data.csv')#remember your quotes!
  19. x = dataset.iloc[:, :-1].values #[LINES, COLUMNS] : = ALL
  20. #dependent variable vector
  21. y = dataset.iloc[:, 3].values
  22.  
  23. #Missing Data Tutorial -replacing NaNs
  24. #Replace missing data with the mean of the columns that the mean is in.
  25. # CTRL+I = Info on clicked value
  26.  
  27. # ------------------------------------------------------------------------MISSING DATA
  28.  
  29. from sklearn.impute import SimpleImputer #import the imputer
  30. imputer = SimpleImputer(missing_values = np.NaN, strategy = 'mean') #characterise the imputer
  31. imputer = imputer.fit(x[:,1:3]) #Upper band +1 to maximum #fit the imputer to the relevant parts
  32. x[:, 1:3] = imputer.transform(x[:, 1:3]) #set x to display the imputer
  33.  
  34.  
  35. #------------------------------------------------------------------------CATEGORICAL DATA
  36.  
  37. from sklearn.preprocessing import LabelEncoder
  38. labelencoder_x= LabelEncoder()
  39. x[:, 0]=labelencoder_x.fit_transform(x[:, 0])
  40.  
  41. #------------------------------------------------------------------------DUMMY ENCODING
  42.  
  43. #---ENCODING MEANS ASSIGNING A NUMBER TO A NAME
  44.  
  45. from sklearn.preprocessing import OneHotEncoder
  46. onehotencoder = OneHotEncoder(categorical_features = [0]) #The Column you wish to split based on answers.
  47.  
  48. x=onehotencoder.fit_transform(x).toarray() #You dont need to sub specify as you already have.
  49.  
  50. #Tranforms Y into an encoded variable vector
  51. labelencoder_y= LabelEncoder()
  52. y=labelencoder_y.fit_transform(y)
  53.  
  54. #-----------------------------------------------------------------------SPLITTING DATASETS -IMPORTANT
  55. #from sklearn.cross_validation import train_test_split
  56. from sklearn.model_selection import train_test_split
  57. x_train, x_test, y_train, y_test = train_test_split(x,y,test_size = 0.2, random_state = 0)#0.5 = 50% of the Data goes to the test. The other 50% goes to the train.
  58. #You should usually pick 0.2, for 20% to the test set.
  59.  
  60. #The training set allows you to train the model to predict the information accurately.
  61. #The test set allows you to ensure what the training set has learned is carried out accurately.
  62.  
  63. #------------------------------------------------------------------------FEATURE SCALING - IMPORTANT
  64. #Models are based on euclidean distances. Even if models arn't the we still need to do feature scaling.
  65.  
  66. #Feature Scaling: Feature scaling helps normalise data so that it can then be predicted.
  67. from sklearn.preprocessing import StandardScaler
  68. sc_x = StandardScaler()
  69. x_train = sc_x.fit_transform(x_train) #Scale the training data
  70. x_test = sc_x.fit_transform(x_test)#Scale the testing data
  71.  
  72. #--------------------------------------------------------------DATA PREPROCESSING TEMPLATE
  73.  
  74. #importing libraries
  75. import numpy as np #Importing of Library and Adding and Alias
  76. import matplotlib.pyplot as plt #as plt!
  77. import pandas as pd
  78.  
  79. #importing dataset
  80. dataset = pd.read_csv('Data.csv')#remember your quotes!
  81. x = dataset.iloc[:, :-1].values
  82. y = dataset.iloc[:, 3].values
  83.  
  84. #splitting dataset into train and test
  85. from sklearn.model_selection import train_test_split
  86. x_train, x_test, y_train, y_test = train_test_split(x,y,test_size = 0.2, random_state = 0)
  87.  
  88. #feature scaling
  89. """
  90. from sklearn.preprocessing import StandardScaler
  91. sc_x = StandardScaler()
  92. x_train = sc_x.fit_transform(x_train) #Scale the training data
  93. x_test = sc_x.fit_transform(x_test)#Scale the testing data
  94. """
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement