Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- # -*- coding: utf-8 -*-
- '''
- Created on 21-05-2011
- @author: jakub
- '''
- import numpy as np
- from scipy.spatial.distance import mahalanobis, euclidean
#===============================================================================
# CPoint
# @note: A class representing a point in an n-dimensional environment
#===============================================================================
class CPoint(object):
    """A point in an n-dimensional space.

    Attributes:
        coords: the coordinate sequence exactly as passed to the constructor
                (it is stored by reference, not copied).
        n:      number of dimensions, i.e. ``len(coords)``.
    """

    def __init__(self, cords):
        """Create a point.

        @param cords: sequence of coordinates; its length defines the
                      dimensionality of the point
        """
        self.coords = cords
        self.n = len(cords)

    def __repr__(self):
        """Return ``"P"`` followed by the string form of the coordinates."""
        return "P" + str(self.coords)
#===============================================================================
# CCluster
# @note: A class representing a cluster of points in an n-dimensional environment
#===============================================================================
class CCluster(object):
    """A non-empty cluster of points that all have the same dimensionality.

    Attributes:
        points: list of the member points (each must expose ``n``)
        nDim:   dimensionality shared by every member point
        n:      number of points currently in the cluster
    """

    def __init__(self, points):
        """Build a cluster from a sequence of points.

        @param points: non-empty sequence of points the cluster is made of
        @raise Exception: if ``points`` is empty or the points do not all
                          have the same number of dimensions
        """
        if len(points) == 0:
            raise Exception("Empty cluster")
        nDim = points[0].n
        for point in points:
            if point.n != nDim:
                raise Exception("Wrong points dimmensions")
        # Keep a private list: merge() extends it in place, and that must not
        # silently grow the sequence the caller handed in.
        self.points = list(points)
        self.nDim = nDim
        self.n = len(self.points)

    def __repr__(self):
        """Return ``"C"`` followed by the string form of the point list."""
        return "C" + str(self.points)

    def merge(self, secondCluster):
        """Absorb every point of ``secondCluster`` into this cluster.

        ``secondCluster`` itself is left unchanged.

        @param secondCluster: cluster whose points are appended to this one
        @raise Exception: if the two clusters have different dimensionality;
                          in that case this cluster is not modified
        """
        # Validate before mutating so a failed merge leaves the cluster intact.
        if self.nDim != secondCluster.nDim:
            raise Exception("Wrong points dimmensions")
        # Extend with the other cluster's points; the cluster object itself
        # is not iterable.
        self.points.extend(secondCluster.points)
        self.n = len(self.points)
#===============================================================================
# CMetric
# @note: Class representing the available point-to-point metrics
#===============================================================================
class CMetric(object):
    """Point-to-point distance in a metric selected by name.

    Supported names are ``"euclides"`` and ``"mahalanobis"``.
    """

    def __init__(self, name, inverseCovariance=None):
        """Select the metric.

        @param name: which metric to use: "euclides" or "mahalanobis"
        @param inverseCovariance: optional inverse covariance matrix
            (n x n) used by the "mahalanobis" metric. When omitted, a
            covariance is estimated from the two points being compared
            (see computeDistance).
        """
        self.name = name
        self.inverseCovariance = inverseCovariance

    def computeDistance(self, pointA, pointB):
        """Return the distance between two points in the selected metric.

        @param pointA: first point (must expose ``n`` and ``coords``)
        @param pointB: second point (must expose ``n`` and ``coords``)
        @return: distance between points A and B
        @raise Exception: if the points differ in dimensionality or the
                          metric name is not recognised
        """
        if pointA.n != pointB.n:
            raise Exception("Wrong points dimmensions")
        if self.name == "euclides":
            return euclidean(pointA.coords, pointB.coords)
        if self.name == "mahalanobis":
            VI = self._inverseCovarianceFor(pointA, pointB)
            return mahalanobis(pointA.coords, pointB.coords, VI)
        # Fail loudly instead of handing None back to numeric callers.
        raise Exception("Unknown metric: " + str(self.name))

    def _inverseCovarianceFor(self, pointA, pointB):
        """Return the inverse covariance matrix for a mahalanobis call."""
        if self.inverseCovariance is not None:
            return np.asarray(self.inverseCovariance, dtype=float)
        # Stack the points as two observations (rows) so np.cov produces an
        # n x n matrix; concatenating them into one flat vector would give a
        # single scalar variance instead.
        observations = np.vstack((pointA.coords, pointB.coords))
        covariance = np.atleast_2d(np.cov(observations.T))
        # Two observations give a covariance of rank <= 1, which is singular
        # for n > 1, so the Moore-Penrose pseudo-inverse is used.
        # NOTE(review): with this two-point estimate the distance is
        # essentially constant (about sqrt(2)) for any two distinct points;
        # pass a dataset-level inverseCovariance for a meaningful result.
        return np.linalg.pinv(covariance)
class CDistance(object):
    """Cluster-to-cluster distance (linkage) built on a point metric.

    Supported linkage names: ``"max"`` (furthest pair), ``"min"`` (nearest
    pair), ``"pga"`` (pair-group average), ``"pgc"`` (pair-group centroid)
    and ``"ward"``.
    """

    def __init__(self, distanceString, metricString):
        """Configure the linkage and the underlying point metric.

        @param distanceString: one of: max, min, pga, pgc, ward
        @param metricString: metric name handed to CMetric
                             (euclides, mahalanobis)
        """
        self.distanceString = distanceString
        self.metric = CMetric(metricString)

    def getDistanceBetweenPoints(self, PointA, PointB):
        """Return the distance between two points in the configured metric.

        @param PointA: first point
        @param PointB: second point
        @raise Exception: if the points differ in dimensionality
        """
        if PointA.n != PointB.n:
            raise Exception("Wrong number of dimensions")
        return self.metric.computeDistance(PointA, PointB)

    def __getDistanceArrayBetweenPointsFromClusters(self, ClustA, ClustB):
        """Return a ClustA.n x ClustB.n array of pairwise point distances.

        Entry [i][j] is the distance between ClustA.points[i] and
        ClustB.points[j].
        """
        out = np.zeros((ClustA.n, ClustB.n))
        for i in range(ClustA.n):
            for j in range(ClustB.n):
                out[i][j] = self.getDistanceBetweenPoints(
                    ClustA.points[i], ClustB.points[j])
        return out

    def __getDistanceBetweenNearestElementsOfCluster(self, ClustA, ClustB):
        """Return the smallest pairwise distance between the two clusters."""
        distancesMatrix = self.__getDistanceArrayBetweenPointsFromClusters(
            ClustA, ClustB)
        return distancesMatrix.min()

    def __getDistanceBetweenFurthestElementsOfCluster(self, ClustA, ClustB):
        """Return the largest pairwise distance between the two clusters."""
        distancesMatrix = self.__getDistanceArrayBetweenPointsFromClusters(
            ClustA, ClustB)
        return distancesMatrix.max()

    def __getPairGroupAverageDistance(self, ClustA, ClustB):
        """Return the mean of all pairwise distances between the clusters."""
        distancesMatrix = self.__getDistanceArrayBetweenPointsFromClusters(
            ClustA, ClustB)
        return np.average(distancesMatrix)

    def __getPairGroupCentroidDistance(self, clusterA, clusterB):
        """Return the distance between the two cluster centroids."""
        centroidPointClusterA = self.getCentroid(clusterA)
        centroidPointClusterB = self.getCentroid(clusterB)
        return self.getDistanceBetweenPoints(centroidPointClusterA,
                                             centroidPointClusterB)

    def getCentroid(self, cluster):
        """Return the centroid of a cluster as a CPoint.

        Each coordinate is the arithmetic mean of that coordinate over all
        points of the cluster.

        @param cluster: cluster exposing ``points`` and ``nDim``
        """
        count = len(cluster.points)
        centroidCoords = []
        for dim in range(cluster.nDim):
            total = 0.0
            for point in cluster.points:
                total += float(point.coords[dim])
            centroidCoords.append(total / count)
        return CPoint(centroidCoords)

    def __getWardDistance(self, ClustA, ClustB):
        """Return Ward's distance between two clusters.

        Computed as ``|cA - cB|^2 * nA * nB / (nA + nB)`` where ``cA``/``cB``
        are the centroids and ``nA``/``nB`` the numbers of points in each
        cluster. The squared difference is Euclidean; the configured point
        metric is not consulted here.

        @raise Exception: if the clusters differ in dimensionality
        """
        # Dimensionality lives on the clusters; a centroid CPoint has no nDim.
        if ClustA.nDim != ClustB.nDim:
            raise Exception("Wrong dimensions of clusters")
        centroidA = self.getCentroid(ClustA)
        centroidB = self.getCentroid(ClustB)
        difference = np.zeros(ClustA.nDim)
        for i in range(ClustA.nDim):
            difference[i] = centroidA.coords[i] - centroidB.coords[i]
        # The weight uses the cluster sizes, not the centroid dimensionality.
        sizeA = ClustA.n
        sizeB = ClustB.n
        return np.dot(difference, difference) * sizeA * sizeB / float(sizeA + sizeB)

    def computeDistance(self, clusterA, clusterB):
        """Return the distance between two clusters for the chosen linkage.

        @param clusterA: first cluster
        @param clusterB: second cluster
        @raise Exception: if the linkage name given at construction is not
                          one of max, min, pga, pgc, ward
        """
        if self.distanceString == 'max':
            return self.__getDistanceBetweenFurthestElementsOfCluster(clusterA, clusterB)
        elif self.distanceString == 'min':
            return self.__getDistanceBetweenNearestElementsOfCluster(clusterA, clusterB)
        elif self.distanceString == 'pga':
            return self.__getPairGroupAverageDistance(clusterA, clusterB)
        elif self.distanceString == 'pgc':
            return self.__getPairGroupCentroidDistance(clusterA, clusterB)
        elif self.distanceString == 'ward':
            return self.__getWardDistance(clusterA, clusterB)
        # Fail loudly instead of returning None for a misspelled linkage.
        raise Exception("Unknown distance: " + str(self.distanceString))
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement