Untitled

from __future__ import division
import csv
import numpy as np
import urllib
import math
import random

X = []
y = []
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data"

numClasses=3
pArray = []
classes = ['Iris-setosa', 'Iris-versicolor', 'Iris-virginica']


def loadData():
    global X
    global y
    # download the file
    raw_data = urllib.urlopen(url)

    # load the CSV file into features (X) and class (y)
    for row in csv.reader(raw_data):
        if row:
            y.append(row[-1])
            X.append(row[:-1])

    X = np.asarray(X)
    y = np.asarray(y)
    X = X.astype(np.float)
    [m, n] = X.shape


    class1 = X[0:50]
    class2 = X[51:100]
    class3 = X[101:150]

    # divide training set in the 3 classes
    classes = [class1, class2, class3]

    #calculate the probability
    calculateMeanandVar(classes, n)


def calculateMeanandVar(classes, n):
    for i in range(0,3):
        pArray.append([])
        for x in range(0,n):
            mean = np.mean(classes[i][:,x])
            var = np.var(classes[i][:,x])
            pArray[i].append([mean, var])
    classify()

def calculateProbability(mean, stdev, x):
    exponent = math.exp(-(math.pow(x-mean,2)/(2*stdev)))
    return (1.0 / (math.sqrt((2.0*math.pi) * stdev))) * exponent

def classify():
    my_randoms = random.sample(xrange(150), 100)
    corr=0
    for q in my_randoms:
        x1 = X[q,:]
        results=[]

        for i in range(numClasses):

            results.append([])

            # calculate the probabilities of the features of q being class i
            pA = 0.3333
            p0 = calculateProbability(pArray[i][0][0], pArray[i][0][1], x1[0])
            p1 = calculateProbability(pArray[i][1][0], pArray[i][1][1], x1[1])
            p2 = calculateProbability(pArray[i][2][0], pArray[i][2][1], x1[2])
            p3 = calculateProbability(pArray[i][3][0], pArray[i][3][1], x1[3])
            p = p0 * p1 * p2 * p3 * pA
            results[i].append(p)

        # get the max value and index (which class row X[q,:] has the highest
        # probability of belonging to)
        max_value = max(results)
        max_index = results.index(max_value)

        if (classes[max_index] == y[max_index]):
            corr +=1


    print('\nPercentage of rightly classified rows: {0}%').format(corr)


loadData()
# loadTest()