#!/usr/bin/python
import sys
import numpy as np
from scipy.spatial import distance

# Data sets are [[sepal length, sepal width, petal length, petal width], ...],
# one data set per class.

# Returns the average Euclidean distance between every pair of vectors in the data set.
# data = [[vector 1], [vector 2], ...]
def euclid_dist(data):
    avg = 0
    count = 0
    for i in range(len(data)):
        for j in range(i + 1, len(data)):
            avg += distance.euclidean(data[i], data[j])
            count += 1
    return avg / count
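
# Not part of the original script: a vectorized sketch of the same average using
# scipy.spatial.distance.pdist, which returns the condensed vector of all pairwise
# Euclidean distances, so its mean equals the loop above.
def euclid_dist_pdist(data):
    from scipy.spatial.distance import pdist
    # pdist yields one distance per unordered pair (i, j) with i < j
    return pdist(data, metric='euclidean').mean()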

# Returns the average Mahalanobis distance between every pair of vectors in the data set.
# data = [[vector 1], [vector 2], ...]
def mahalanobis(data):
    avg = 0
    count = 0
    covar = np.cov(data, rowvar=False)   # covariance across the feature columns
    invcovar = np.linalg.inv(covar)
    for i in range(len(data)):
        for j in range(i + 1, len(data)):
            avg += distance.mahalanobis(data[i], data[j], invcovar)
            count += 1
    return avg / count
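
# Not part of the original script: the same average via pdist's 'mahalanobis'
# metric, passing the inverse covariance through the VI keyword. Like the loop
# above, this assumes the covariance matrix of the data is invertible.
def mahalanobis_pdist(data):
    import numpy as np
    from scipy.spatial.distance import pdist
    inv_cov = np.linalg.inv(np.cov(data, rowvar=False))
    return pdist(data, metric='mahalanobis', VI=inv_cov).mean()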

# Placeholder for a Hausdorff-distance computation; not implemented yet.
def hausdorf(data):
    return data
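
# hausdorf() above is only a stub. One possible completion (an assumption about the
# intended behaviour, not the author's code) is the symmetric Hausdorff distance
# between two point sets, built on scipy.spatial.distance.directed_hausdorff
# (SciPy >= 0.19):
def hausdorff_dist(set_a, set_b):
    from scipy.spatial.distance import directed_hausdorff
    # directed_hausdorff returns (distance, index_a, index_b); take the distance
    # in both directions and keep the larger one
    return max(directed_hausdorff(set_a, set_b)[0],
               directed_hausdorff(set_b, set_a)[0])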

# Normalizes the data row by row so that each row sums to 1. Expects a 2-D numpy
# array; every row must have the same length. Adapted from
# http://stackoverflow.com/questions/8904694/how-to-normalize-a-2-dimensional-numpy-array-in-python-less-verbose
def normalize(data):
    row_sums = data.sum(axis=1)
    norm_data = np.zeros(data.shape)
    for i, (row, row_sum) in enumerate(zip(data, row_sums)):
        norm_data[i, :] = row / row_sum
    return norm_data
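
# Not part of the original script: the linked StackOverflow question also gives a
# broadcasting form that does the same row-wise normalization without a loop.
def normalize_broadcast(data):
    import numpy as np
    return data / data.sum(axis=1)[:, np.newaxis]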

def main():
    if len(sys.argv) < 2:
        print("No file name to open. USAGE: ./" + sys.argv[0] + " \"file_name\"")
        sys.exit(1)
    try:
        data_file = open(sys.argv[1])
    except IOError:
        print("Could not open file! Closing " + sys.argv[0])
        sys.exit(1)
    with data_file:
        data = []  # each element of data is one class's data set, in the form described above
        norm = []
        data_set = []
        classification = "Iris-setosa"
        for line in data_file:
            if not line.strip():  # skip blank lines (the UCI file ends with one)
                continue
            line_data = []
            while True:
                comma_pos = line.find(',')
                if comma_pos == -1:
                    # No comma left, so the remainder is the class label. When the
                    # label changes, the rows collected so far form one complete class.
                    if line.strip() != classification:
                        classification = line.strip()
                        data.append(np.array(data_set))
                        data_set = []
                    break
                line_data.append(float(line[:comma_pos]))
                line = line[comma_pos + 1:]
            data_set.append(line_data)
        if data_set:  # append the last class, which no label change follows
            data.append(np.array(data_set))
        #for data_set in data:
        #    for line in data_set:
        #        print(line)
        #    print("start of new set")
        norm.append(normalize(data[0]))
        norm.append(normalize(data[1]))
        norm.append(normalize(data[2]))
        print(euclid_dist(data[0]))
        print(euclid_dist(data[1]))
        print(euclid_dist(data[2]))
        print(euclid_dist(norm[0]))
        print(euclid_dist(norm[1]))
        print(euclid_dist(norm[2]))
        print(mahalanobis(data[0]))
        print(mahalanobis(data[1]))
        print(mahalanobis(data[2]))
        print("")
        print(mahalanobis(norm[0]))
        print(mahalanobis(norm[1]))
        print(mahalanobis(norm[2]))
        #hausdorf(data[0])
        #hausdorf(data[1])
        #hausdorf(data[2])
        #hausdorf(norm[0])
        #hausdorf(norm[1])
        #hausdorf(norm[2])
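
# Not part of the original script: a simpler loader sketch that splits each line on
# commas instead of scanning for them by hand, grouping rows by the class label in
# the last column. It is not called anywhere; it only illustrates the same parsing.
def load_iris(path):
    import numpy as np
    groups = {}
    with open(path) as f:
        for line in f:
            fields = line.strip().split(',')
            if len(fields) < 5:  # skip blank or malformed lines
                continue
            groups.setdefault(fields[-1], []).append([float(x) for x in fields[:4]])
    # dicts keep insertion order (Python 3.7+), so classes come out in file order
    return [np.array(rows) for rows in groups.values()]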

if __name__ == "__main__":
    main()