Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- from math import sqrt
- def readfile(filename):
- lines = [line for line in open(filename)]
- # First line is the column titles
- colnames = lines[0].strip().split('\t')[1:]
- rownames = []
- data = []
- for line in lines[1:]:
- p = line.strip().split('\t')
- # First column in each row is the rowname
- rownames.append(p[0])
- # The data for this row is the remainder of the row
- data.append([float(x) for x in p[1:]])
- # Eachaton 1544, 1482
- # Signal 625, 962
- # Signal 1544, 1482
- # Eachaton 616, 956
- return rownames, colnames, data
- def pearson(v1, v2): # These vector values are the "p" variable values from the "readfile" function above
- # Simple sums
- sum1 = sum(v1)
- sum2 = sum(v2)
- # Sums of the squares
- sum1sq = sum([pow(v, 2) for v in v1])
- sum2sq = sum([pow(v, 2) for v in v2])
- # Sum of the products
- print('v1: ', len(v1), 'v2: ', len(v2))
- psum = sum([v1[i] * v2[i] for i in range(len(v1))])
- # Calculate r (Pearson score)
- num = psum-(sum1*sum2/float(len(v1)))
- den = sqrt((sum1sq-pow(sum1, 2)/float(len(v1))) * (sum2sq-pow(sum2, 2)/float(len(v1))))
- if den == 0:
- return 0
- return 1.0-num/den
- class BiCluster:
- def __init__(self, vec, left=None, right=None, distance=0.0, id=None):
- self.left = left
- self.right = right
- self.vec = vec
- self.id = id
- self.distance = distance
- def hcluster(rows, distance=pearson):
- distances = {}
- currentclustid = -1
- # Clusters are initially just the rows
- clust = [BiCluster(rows[i], id=i) for i in range(len(rows))]
- while len(clust) > 1:
- lowestpair = (0, 1)
- closest = distance(clust[0].vec, clust[1].vec)
- # Loop through every pair looking for the smallest distance
- for i in range(len(clust)):
- for j in range(i+1, len(clust)):
- # Distances in the cache of distance calculations
- if (clust[i].id, clust[j].id) not in distances:
- distances[(clust[i].id, clust[j].id)] = distance(clust[i].vec, clust[j].vec)
- d = distances[(clust[i].id, clust[j].id)]
- if d < closest:
- closest = d
- lowestpair = (i, j)
- # Calculate the average of the two clusters
- mergevec = [(clust[lowestpair[0]].vec[i]+clust[lowestpair[1]].vec[i])/2.0 for i in range(len(clust[0].vec))]
- # Create the new cluster
- newcluster = BiCluster(mergevec, left=clust[lowestpair[0]], right=clust[lowestpair[1]], distance=closest,
- id=currentclustid)
- # Cluster ids that weren't in the original set are negative
- currentclustid -= 1
- del clust[lowestpair[1]]
- del clust[lowestpair[0]]
- clust.append(newcluster)
- return clust[0]
- def printclust(clust, labels=None, n=0):
- # Indent to make a hierarchy layout
- for i in range(n):
- print('',)
- if clust.id < 0:
- # Negative id means that this is a branch
- print('-')
- else:
- # Positive id means that this is an endpoint
- if labels is None:
- print(clust.id)
- else:
- print(labels[clust.id])
- # Now print the right and left branches
- if clust.left is not None:
- printclust(clust.left, labels=labels, n=n+1)
- if clust.right is not None:
- printclust(clust.right, labels=labels, n=n+1)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement