Advertisement
Guest User

clusters

a guest
Sep 26th, 2016
75
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 3.54 KB | None | 0 0
  1. from math import sqrt
  2.  
  3.  
  4. def readfile(filename):
  5. lines = [line for line in open(filename)]
  6. # First line is the column titles
  7. colnames = lines[0].strip().split('\t')[1:]
  8. rownames = []
  9. data = []
  10. for line in lines[1:]:
  11. p = line.strip().split('\t')
  12. # First column in each row is the rowname
  13. rownames.append(p[0])
  14. # The data for this row is the remainder of the row
  15. data.append([float(x) for x in p[1:]])
  16. # Eachaton 1544, 1482
  17. # Signal 625, 962
  18.  
  19. # Signal 1544, 1482
  20. # Eachaton 616, 956
  21. return rownames, colnames, data
  22.  
  23.  
  24. def pearson(v1, v2): # These vector values are the "p" variable values from the "readfile" function above
  25. # Simple sums
  26. sum1 = sum(v1)
  27. sum2 = sum(v2)
  28.  
  29. # Sums of the squares
  30. sum1sq = sum([pow(v, 2) for v in v1])
  31. sum2sq = sum([pow(v, 2) for v in v2])
  32.  
  33. # Sum of the products
  34. print('v1: ', len(v1), 'v2: ', len(v2))
  35. psum = sum([v1[i] * v2[i] for i in range(len(v1))])
  36.  
  37.  
  38. # Calculate r (Pearson score)
  39. num = psum-(sum1*sum2/float(len(v1)))
  40. den = sqrt((sum1sq-pow(sum1, 2)/float(len(v1))) * (sum2sq-pow(sum2, 2)/float(len(v1))))
  41. if den == 0:
  42. return 0
  43.  
  44. return 1.0-num/den
  45.  
  46.  
  47. class BiCluster:
  48. def __init__(self, vec, left=None, right=None, distance=0.0, id=None):
  49. self.left = left
  50. self.right = right
  51. self.vec = vec
  52. self.id = id
  53. self.distance = distance
  54.  
  55.  
  56. def hcluster(rows, distance=pearson):
  57. distances = {}
  58. currentclustid = -1
  59.  
  60. # Clusters are initially just the rows
  61. clust = [BiCluster(rows[i], id=i) for i in range(len(rows))]
  62.  
  63. while len(clust) > 1:
  64. lowestpair = (0, 1)
  65. closest = distance(clust[0].vec, clust[1].vec)
  66.  
  67. # Loop through every pair looking for the smallest distance
  68. for i in range(len(clust)):
  69. for j in range(i+1, len(clust)):
  70. # Distances in the cache of distance calculations
  71. if (clust[i].id, clust[j].id) not in distances:
  72.  
  73. distances[(clust[i].id, clust[j].id)] = distance(clust[i].vec, clust[j].vec)
  74. d = distances[(clust[i].id, clust[j].id)]
  75.  
  76. if d < closest:
  77. closest = d
  78. lowestpair = (i, j)
  79.  
  80. # Calculate the average of the two clusters
  81. mergevec = [(clust[lowestpair[0]].vec[i]+clust[lowestpair[1]].vec[i])/2.0 for i in range(len(clust[0].vec))]
  82.  
  83. # Create the new cluster
  84. newcluster = BiCluster(mergevec, left=clust[lowestpair[0]], right=clust[lowestpair[1]], distance=closest,
  85. id=currentclustid)
  86.  
  87. # Cluster ids that weren't in the original set are negative
  88. currentclustid -= 1
  89. del clust[lowestpair[1]]
  90. del clust[lowestpair[0]]
  91. clust.append(newcluster)
  92.  
  93. return clust[0]
  94.  
  95.  
  96. def printclust(clust, labels=None, n=0):
  97. # Indent to make a hierarchy layout
  98. for i in range(n):
  99. print('',)
  100. if clust.id < 0:
  101. # Negative id means that this is a branch
  102. print('-')
  103. else:
  104. # Positive id means that this is an endpoint
  105. if labels is None:
  106. print(clust.id)
  107. else:
  108. print(labels[clust.id])
  109.  
  110. # Now print the right and left branches
  111. if clust.left is not None:
  112. printclust(clust.left, labels=labels, n=n+1)
  113. if clust.right is not None:
  114. printclust(clust.right, labels=labels, n=n+1)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement