clusters

from math import sqrt


def readfile(filename):
    lines = [line for line in open(filename)]
    # First line is the column titles
    colnames = lines[0].strip().split('\t')[1:]
    rownames = []
    data = []
    for line in lines[1:]:
        p = line.strip().split('\t')
        # First column in each row is the rowname
        rownames.append(p[0])
        # The data for this row is the remainder of the row
        data.append([float(x) for x in p[1:]])
        # Eachaton 1544, 1482
        # Signal 625, 962

        # Signal 1544, 1482
        # Eachaton 616, 956
    return rownames, colnames, data


def pearson(v1, v2):  # These vector values are the "p" variable values from the "readfile" function above
    # Simple sums
    sum1 = sum(v1)
    sum2 = sum(v2)

    # Sums of the squares
    sum1sq = sum([pow(v, 2) for v in v1])
    sum2sq = sum([pow(v, 2) for v in v2])

    # Sum of the products
    print('v1: ', len(v1), 'v2: ', len(v2))
    psum = sum([v1[i] * v2[i] for i in range(len(v1))])


    # Calculate r (Pearson score)
    num = psum-(sum1*sum2/float(len(v1)))
    den = sqrt((sum1sq-pow(sum1, 2)/float(len(v1))) * (sum2sq-pow(sum2, 2)/float(len(v1))))
    if den == 0:
        return 0

    return 1.0-num/den


class BiCluster:
    def __init__(self, vec, left=None, right=None, distance=0.0, id=None):
        self.left = left
        self.right = right
        self.vec = vec
        self.id = id
        self.distance = distance


def hcluster(rows, distance=pearson):
    distances = {}
    currentclustid = -1

    # Clusters are initially just the rows
    clust = [BiCluster(rows[i], id=i) for i in range(len(rows))]

    while len(clust) > 1:
        lowestpair = (0, 1)
        closest = distance(clust[0].vec, clust[1].vec)

        # Loop through every pair looking for the smallest distance
        for i in range(len(clust)):
            for j in range(i+1, len(clust)):
                # Distances in the cache of distance calculations
                if (clust[i].id, clust[j].id) not in distances:

                    distances[(clust[i].id, clust[j].id)] = distance(clust[i].vec, clust[j].vec)
                    d = distances[(clust[i].id, clust[j].id)]

                    if d < closest:
                        closest = d
                        lowestpair = (i, j)

        # Calculate the average of the two clusters
        mergevec = [(clust[lowestpair[0]].vec[i]+clust[lowestpair[1]].vec[i])/2.0 for i in range(len(clust[0].vec))]

        # Create the new cluster
        newcluster = BiCluster(mergevec, left=clust[lowestpair[0]], right=clust[lowestpair[1]], distance=closest,
                               id=currentclustid)

        # Cluster ids that weren't in the original set are negative
        currentclustid -= 1
        del clust[lowestpair[1]]
        del clust[lowestpair[0]]
        clust.append(newcluster)

    return clust[0]


def printclust(clust, labels=None, n=0):
    # Indent to make a hierarchy layout
    for i in range(n):
        print('',)
    if clust.id < 0:
        # Negative id means that this is a branch
        print('-')
    else:
        # Positive id means that this is an endpoint
        if labels is None:
            print(clust.id)
        else:
            print(labels[clust.id])

    # Now print the right and left branches
    if clust.left is not None:
        printclust(clust.left, labels=labels, n=n+1)
    if clust.right is not None:
        printclust(clust.right, labels=labels, n=n+1)