Not a member of Pastebin yet?
Sign up —
it unlocks many cool features!
#!/usr/bin/env python
# coding: utf-8

# k-NN classification on the UCI Ionosphere dataset (notebook export).

import os
import numpy as np
import csv

home_dir = os.path.expanduser("~")
print(home_dir)

# BUG FIX: the original used the Windows-only literal ".\ionosphere.data.txt"
# (a backslash path separator); os.path.join builds the relative path portably.
data_filename = os.path.join(".", "ionosphere.data.txt")
print(data_filename)

# Pre-allocate: 351 samples x 34 float features, plus a boolean target
# (set to True later when the class label is 'g').
X = np.zeros((351, 34), dtype='float')
y = np.zeros((351,), dtype='bool')
# Parse the CSV file: the first 34 columns are float features, the last
# column is the class label ('g' for good, 'b' for bad).
# BUG FIX: the pasted source had lost all indentation inside the `with`
# block and the `for` loop, making it a SyntaxError; structure restored.
with open(data_filename, 'r') as input_file:
    reader = csv.reader(input_file)
    for i, row in enumerate(reader):
        # Get the data, converting each item to a float
        data = [float(datum) for datum in row[:-1]]
        # Set the appropriate row in our dataset
        X[i] = data
        # 1 if the class is 'g', 0 otherwise
        y[i] = row[-1] == 'g'

print(X[1][-1])
from sklearn.model_selection import train_test_split

# Split off a held-out test set (sklearn's default 25%), fixed seed for
# reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=14)
print("There are {} samples in the training dataset".format(X_train.shape[0]))
print("There are {} samples in the testing dataset".format(X_test.shape[0]))
print("Each sample has {} features".format(X_train.shape[1]))

from sklearn.neighbors import KNeighborsClassifier

# k-NN classifier with default parameters; `estimator` is reused by the
# cross-validation cell further down.
estimator = KNeighborsClassifier()
print(estimator)
estimator.fit(X_train, y_train)

# Score the fitted model on the held-out set: fraction of correct
# predictions, expressed as a percentage.
y_predicted = estimator.predict(X_test)
accuracy = (y_test == y_predicted).mean() * 100
print("The accuracy is {0:.1f}%".format(accuracy))
# cross_val_score uses Stratified K-Fold for classification targets.
from sklearn.model_selection import cross_val_score

scores = cross_val_score(estimator, X, y, scoring='accuracy', cv=3)

# Sweep n_neighbors over 1..20 (inclusive) and record the 3-fold
# cross-validated accuracy for each setting; `parameter_values`,
# `avg_scores` and `all_scores` are plotted by the next cell.
# BUG FIX: the pasted source had lost the for-loop indentation
# (SyntaxError); structure restored. Dead commented-out code removed.
avg_scores = []
all_scores = []
parameter_values = list(range(1, 21))  # Including 20
for n_neighbors in parameter_values:
    estimator = KNeighborsClassifier(n_neighbors=n_neighbors)
    scores = cross_val_score(estimator, X, y, scoring='accuracy', cv=3)
    avg_scores.append(np.mean(scores))
    all_scores.append(scores)
# '%matplotlib inline' is an IPython magic; get_ipython() does not exist
# when this file runs as a plain script, so guard it.
# BUG FIX: the original called get_ipython() unconditionally, which
# raises NameError outside a notebook/IPython session.
try:
    get_ipython().run_line_magic('matplotlib', 'inline')
except NameError:
    pass

from matplotlib import pyplot as plt

# Accuracy as a function of the number of neighbours.
plt.plot(parameter_values, avg_scores, '-o')
# Simulate badly-scaled data: copy X and divide every second feature
# column by 10, then compare cross-validated accuracy on both versions.
x_broken = np.array(X)
x_broken[:, ::2] /= 10

# NOTE(review): 'esimator' is a typo for 'estimator', kept because the
# MinMaxScaler cells below reuse this exact name.
esimator = KNeighborsClassifier()

original_scores = cross_val_score(esimator, X, y, scoring='accuracy')
print("The original average accuracy for is {0:.1f}%".format(np.mean(original_scores) * 100))

broken_scores = cross_val_score(esimator, x_broken, y, scoring='accuracy')
print("The broken average accuracy for is {0:.1f}%".format(np.mean(broken_scores) * 100))
from sklearn.preprocessing import MinMaxScaler

# Rescale every feature to [0, 1]; k-NN is distance-based, so scaling
# restores the weight of the columns that were divided by 10 above.
x_transformed = MinMaxScaler().fit_transform(x_broken)
estimator = KNeighborsClassifier()
# BUG FIX: the original created `estimator` but then scored the stale
# `esimator` from the previous cell, leaving the new object unused.
# BUG FIX: this cell was duplicated verbatim (In[59] and In[60]);
# the copy-paste duplicate has been removed.
transformed_scores = cross_val_score(estimator, x_transformed, y, scoring='accuracy')
print("The average accuracy for is {0:.1f}%".format(np.mean(transformed_scores) * 100))
from sklearn.pipeline import Pipeline

# Chain scaling and classification so that, under cross-validation, the
# scaler is fit only on each training fold (no test-fold leakage).
# BUG FIX: the original assigned a bare list of steps — the Pipeline(...)
# constructor call was missing, so cross_val_score would reject it.
scaling_pipeline = Pipeline([
    ('scale', MinMaxScaler()),
    ('predict', KNeighborsClassifier()),
])
print(scaling_pipeline)

scores = cross_val_score(scaling_pipeline, x_broken, y, scoring='accuracy')
# BUG FIX: the original printed np.mean(transformed_scores) — the result
# of the previous cell — instead of the pipeline's own `scores`.
print("the pipeline scored an average accuracy for is {0:.1f}%".format(np.mean(scores) * 100))
Add a comment
Please sign in to add a comment.