loubot

Ps10

Apr 19th, 2013
# Problem Set 10
# Name:
# Collaborators:
# Time:

#Code shared across examples
import pylab, random, string, copy, math
#Raw strings keep the Windows backslashes in these paths literal
fName = r'E:\MIT_6_00\MIT_6_00_Assignments\problemSet10\counties.txt'
##fName = r'E:\MIT_6_00\MIT_6_00_Assignments\problemSet10\sample.txt'
class Point(object):
    def __init__(self, name, originalAttrs, normalizedAttrs = None):
        """normalizedAttrs and originalAttrs are both arrays"""
        self.name = name
        self.unNormalized = originalAttrs
        self.attrs = normalizedAttrs
    def dimensionality(self):
        return len(self.attrs)
    def getAttrs(self):
        return self.attrs
    def getOriginalAttrs(self):
        return self.unNormalized
    def distance(self, other):
        #Euclidean distance metric
        difference = self.attrs - other.attrs
        return sum(difference * difference) ** 0.5
    def getName(self):
        return self.name
    def toStr(self):
        return self.name + str(self.attrs)
    def __str__(self):
        return self.name

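# Illustrative sketch, not from the original handout: how Point's Euclidean
# distance behaves on two toy attribute vectors (the names and values below
# are made up purely for demonstration).
##p1 = Point('a', pylab.array([0.0, 0.0]), pylab.array([0.0, 0.0]))
##p2 = Point('b', pylab.array([3.0, 4.0]), pylab.array([3.0, 4.0]))
##print p1.distance(p2)   # 5.0, i.e. (3**2 + 4**2) ** 0.5
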
class County(Point):
    weights = pylab.array([1.0] * 14)

    # Override Point.distance to use County.weights to decide the
    # significance of each dimension
    def distance(self, other):
        difference = self.getAttrs() - other.getAttrs()
        return sum(County.weights * difference * difference) ** 0.5

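# Sketch (an assumption, not part of the handout code): reassigning
# County.weights changes how much each of the 14 features contributes to the
# distance metric; a weight of 0.0 removes that feature from clustering.
##County.weights = pylab.array([1.0] * 13 + [0.0])   # ignore the last feature
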
class Cluster(object):
    def __init__(self, points, pointType):
        self.points = points
        self.pointType = pointType
        self.centroid = self.computeCentroid()
    def getCentroid(self):
        return self.centroid
    def computeCentroid(self):
        dim = self.points[0].dimensionality()
        totVals = pylab.array([0.0]*dim)
        for p in self.points:
            totVals += p.getAttrs()
        meanAttrs = totVals/float(len(self.points))
        meanPoint = self.pointType('mean', meanAttrs, meanAttrs)
        return meanPoint
    def update(self, points):
        oldCentroid = self.centroid
        self.points = points
        if len(points) > 0:
            self.centroid = self.computeCentroid()
            return oldCentroid.distance(self.centroid)
        else:
            return 0.0
    def getPoints(self):
        return self.points
    def contains(self, name):
        for p in self.points:
            if p.getName() == name:
                return True
        return False
    def toStr(self):
        result = ''
        for p in self.points:
            result = result + p.toStr() + ', '
        return result[:-2]
    def __str__(self):
        result = ''
        for p in self.points:
            result = result + str(p) + ', '
        return result[:-2]


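# Sketch of how kmeans below uses Cluster (the point names here are
# hypothetical): update() swaps in a new point list and returns how far the
# centroid moved, which kmeans compares against the convergence cutoff.
##c = Cluster([somePoint], County)
##shift = c.update([somePoint, anotherPoint])   # distance the centroid moved
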
def kmeans(points, k, cutoff, pointType, minIters = 3, maxIters = 100, toPrint = False):
    """ Returns (Cluster list, max dist of any point to its cluster) """
    #Uses random initial centroids
    initialCentroids = random.sample(points,k)
    clusters = []
    for p in initialCentroids:
        clusters.append(Cluster([p], pointType))
    numIters = 0
    biggestChange = cutoff
    while (biggestChange >= cutoff and numIters < maxIters) or numIters < minIters:
        print "Starting iteration " + str(numIters)
        newClusters = []
        for c in clusters:
            newClusters.append([])
        #Assign each point to the cluster with the nearest centroid
        for p in points:
            smallestDistance = p.distance(clusters[0].getCentroid())
            index = 0
            for i in range(len(clusters)):
                distance = p.distance(clusters[i].getCentroid())
                if distance < smallestDistance:
                    smallestDistance = distance
                    index = i
            newClusters[index].append(p)
        #Recompute each centroid and track the largest centroid movement
        biggestChange = 0.0
        for i in range(len(clusters)):
            change = clusters[i].update(newClusters[i])
            #print "Cluster " + str(i) + ": " + str(len(clusters[i].points))
            biggestChange = max(biggestChange, change)
        numIters += 1
        if toPrint:
            print 'Iteration count =', numIters
    maxDist = 0.0
    for c in clusters:
        for p in c.getPoints():
            if p.distance(c.getCentroid()) > maxDist:
                maxDist = p.distance(c.getCentroid())
    print 'Total Number of iterations =', numIters, 'Max Diameter =', maxDist
    print biggestChange
    return clusters, maxDist

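# Example call (commented out, a sketch only): kmeans expects a list of
# Point-like objects, the number of clusters k, a convergence cutoff, and the
# class used to build centroids.
##sampleClusters, sampleMaxDist = kmeans(testPoints, 20, 0.1, County)
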
#US Counties example
def readCountyData(fName, numEntries = 14):
    dataFile = open(fName, 'r')
    dataList = []
    nameList = []
    maxVals = pylab.array([0.0]*numEntries)
    #Build unnormalized feature vector
    for line in dataFile:
        if len(line) == 0 or line[0] == '#':
            continue
        dataLine = line.split()
        name = dataLine[0] + dataLine[1]
        features = []
        #Build vector with numEntries features
        for f in dataLine[2:]:
            try:
                f = float(f)
                features.append(f)
                if f > maxVals[len(features)-1]:
                    maxVals[len(features)-1] = f
            except ValueError:
                #Non-numeric tokens are treated as part of the name
                name = name + f
        if len(features) != numEntries:
            continue
        dataList.append(features)
        nameList.append(name)
    dataFile.close()
    return nameList, dataList, maxVals

def buildCountyPoints(fName):
    """
    Given an input filename, reads County values from the file and returns
    them all in a list.
    """
    nameList, featureList, maxVals = readCountyData(fName)
    points = []
    for i in range(len(nameList)):
        originalAttrs = pylab.array(featureList[i])
        normalizedAttrs = originalAttrs/pylab.array(maxVals)
        points.append(County(nameList[i], originalAttrs, normalizedAttrs))
    return points

def randomPartition(l, p):
    """
    Splits the input list into two partitions, where each element of l is
    in the first partition with probability p and the second one with
    probability (1.0 - p).

    l: The list to split
    p: The probability that an element of l will be in the first partition

    Returns: a tuple of lists, containing the elements of the first and
    second partitions.
    """
    l1 = []
    l2 = []
    for x in l:
        if random.random() < p:
            l1.append(x)
        else:
            l2.append(x)
    return (l1,l2)

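# Sketch of typical use (commented out): the first returned list keeps each
# element with probability p, the second gets the rest.
##firstPart, secondPart = randomPartition(range(100), 0.25)
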
def getAveIncome(cluster):
    """
    Given a Cluster object, finds the average income field over the members
    of that cluster.

    cluster: the Cluster object to check

    Returns: a float representing the computed average income value
    """
    tot = 0.0
    #Income is taken to be the feature at index 1 of each county's
    #unnormalized attribute vector, as used in the loop below
    for c in cluster.getPoints():
        tot += c.getOriginalAttrs()[1]

    return tot / len(cluster.getPoints())


def test(points, k = 200, cutoff = 0.1):
    """
    A sample function to show you how to do a simple kmeans run and graph
    the results.
    """
    incomes = []
    print ''
    clusters, maxSmallest = kmeans(points, k, cutoff, County)

    for i in range(len(clusters)):
        if len(clusters[i].points) == 0: continue
        incomes.append(getAveIncome(clusters[i]))

    pylab.hist(incomes)
    pylab.xlabel('Ave. Income')
    pylab.ylabel('Number of Clusters')
    pylab.show()


points = buildCountyPoints(fName)
random.seed(123)
testPoints = random.sample(points, len(points)//10)


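# Example run (commented out, a sketch only): cluster the 10% subsample built
# above and plot a histogram of average cluster incomes; k = 50 is just an
# illustrative argument.
##test(testPoints, k = 50)
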
def graphRemovedErr(points, kvals = [25, 50, 75, 100, 125, 150], cutoff = 0.1):
    """
    Should produce graphs of the error in training and holdout point sets, and
    the ratio of the error of the points, after clustering for the given values of k.
    For details see Problem 1.
    """

    trainingSet, holdoutSet = randomPartition(points, .2)
    trainingSetSum = []
    holdoutSetSum = []
    for vals in kvals:
        #Training error: summed squared distance of each training point to
        #the centroid of its own cluster
        errorSummation = 0.0
        clusterSet, maxDist = kmeans(trainingSet, vals, cutoff, County)
        for cluster in clusterSet:
            centroid = cluster.getCentroid()
            for point in cluster.getPoints():
                errorSummation += (point.distance(centroid))**2
        trainingSetSum.append(errorSummation)

        #Holdout error: summed squared distance of each holdout point to the
        #nearest cluster centroid
        distCheck = 0.0
        for point in holdoutSet:
            minDist = float('inf')
            for cluster in clusterSet:
                centroid = cluster.getCentroid()
                if point.distance(centroid) < minDist:
                    minDist = point.distance(centroid)
            distCheck += minDist**2
        holdoutSetSum.append(distCheck)

    pylab.plot(kvals, trainingSetSum, label = 'Training set error')
    pylab.plot(kvals, holdoutSetSum, label = 'Holdout set error')
    pylab.xlabel('Values of k')
    pylab.ylabel('Error values')
    pylab.legend()
    pylab.show()

    #Ratio of holdout error to training error, as called for in the docstring
    ratios = [holdoutSetSum[i]/trainingSetSum[i] for i in range(len(kvals))]
    pylab.plot(kvals, ratios)
    pylab.xlabel('Values of k')
    pylab.ylabel('Holdout/training error ratio')
    pylab.show()


graphRemovedErr(points)


def graphPredictionErr(points, dimension, kvals = [25, 50, 75, 100, 125, 150], cutoff = 0.1):
    """
    Given input points and a dimension to predict, should cluster on the
    appropriate values of k and graph the error in the resulting predictions,
    as described in Problem 3.
    """

    # Your Code Here
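

#The stub above is left as in the original paste. What follows is a rough,
#hypothetical sketch of one way Problem 3 could be approached (the full
#problem statement is not included here, so the details are assumptions):
#drop the predicted dimension from the distance metric via County.weights,
#cluster a training partition, predict that dimension for each holdout point
#as the average over its nearest cluster, and plot the summed squared
#prediction error against k.
def graphPredictionErrSketch(points, dimension, kvals = [25, 50, 75, 100, 125, 150], cutoff = 0.1):
    #Ignore the predicted dimension when clustering
    weights = [1.0] * 14
    weights[dimension] = 0.0
    County.weights = pylab.array(weights)
    trainingSet, holdoutSet = randomPartition(points, .2)
    errors = []
    for k in kvals:
        clusterSet, maxDist = kmeans(trainingSet, k, cutoff, County)
        errSum = 0.0
        for point in holdoutSet:
            #Nearest centroid under the weighted (dimension-blind) distance
            nearest = clusterSet[0]
            for cluster in clusterSet:
                if point.distance(cluster.getCentroid()) < point.distance(nearest.getCentroid()):
                    nearest = cluster
            members = nearest.getPoints()
            if len(members) == 0:
                continue
            #Predict the held-out dimension as the cluster's average value
            predicted = sum(p.getOriginalAttrs()[dimension] for p in members) / float(len(members))
            actual = point.getOriginalAttrs()[dimension]
            errSum += (predicted - actual) ** 2
        errors.append(errSum)
    pylab.plot(kvals, errors)
    pylab.xlabel('Values of k')
    pylab.ylabel('Prediction error')
    pylab.show()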