Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #!/usr/bin/env python
- # Programmer : zhuxp
- # Date:
- # Last-modified: 04-16-2015, 15:08:44 EDT
- from __future__ import print_function
- VERSION="0.1"
- import os,sys,argparse
- from bam2x import TableIO,Tools
- from bam2x import IO
- import numpy as np
- from numpy import linalg as LA
- from scipy.linalg import eigh
- import numpy as np
- import itertools
- from bam2x.Tools import gini_coefficient as gini_index
- from scipy.cluster.vq import kmeans,vq
- from numpy import exp,min,arccos,sin,trace,compress,equal,array,sqrt,isnan,power,mean
- from scipy.spatial.distance import squareform,pdist,euclidean
- from os.path import splitext
- import logging
- def ParseArg():
- ''' This Function Parse the Argument '''
- p=argparse.ArgumentParser( description = 'Example: %(prog)s -h', epilog='Library dependency : bam2x')
- p.add_argument('-v','--version',action='version',version='%(prog)s '+VERSION)
- p.add_argument('-i','--input',dest="input",nargs="*",help="input expression matrix files")
- p.add_argument('-o','--output',dest="output",type=str,default="stdout",help="output file DEFAULT: STDOUT")
- p.add_argument('-c',dest="start_col",type=int,default=3,help="start column default:%(default)i")
- p.add_argument('-e','--eigen_cutoff',dest="e",type=float,default=0.95,help="eigen proportion")
- if len(sys.argv)==1:
- print(p.print_help(),file=sys.stderr)
- exit(0)
- return p.parse_args()
- def Main():
- global args,out,logger
- logger=logging.getLogger("akmeans logger")
- logger.setLevel(logging.DEBUG)
- logger.addHandler(logging.StreamHandler())
- args=ParseArg()
- out=IO.fopen(args.output,"w")
- #fname,ext=splitext(args.input)
- mlist=[]
- namelist=[]
- slist=[]
- idxlist=[]
- for f in args.input:
- logger.info("reading "+f)
- names,mat=read_matfile(f)
- mlist.append(mat)
- namelist.append(names)
- slist.append(mat_to_smat(mat))
- idx,_=akmeans(slist[-1],args.e)
- lst=idx_to_list(idx)
- lst.sort(key=lambda x: average_rank_score([mat[i] for i in x]),reverse=True)
- for i,l in enumerate(lst):
- for j in l:
- idx[j]=i
- idxlist.append(idx)
- for i in idxlist:
- logger.info(i)
- S=geometry_average(slist)
- idx,_=akmeans(S,args.e)
- lst=idx_to_list(idx)
- lst.sort(key=lambda x: list_average_rank_score([[mat[i] for i in x] for mat in mlist]),reverse=True)
- for i,l in enumerate(lst):
- for j in l:
- idx[j]=i
- idxlist.append(idx)
- logger.info(idx)
- idxt=array(idxlist).T
- print("gene\t"+"\t".join(args.input)+"\t"+"Interagrated",file=out)
- for i,x in enumerate(idxt):
- print(names[i]+"\t"+"\t".join([str(j) for j in x]),file=out);
- '''
- idx,vector = akmeans(S,args.e)
- idxA,_ = akmeans(SA,args.e1)
- idxB,_ = akmeans(SB,args.e1)
- listA = idx_to_list(idxA)
- listB = idx_to_list(idxB)
- listAB=idx_to_list(idx)
- listA.sort(key=lambda x: average_rank_score([a[i] for i in x]),reverse=True)
- listB.sort(key=lambda x: average_rank_score([b[i] for i in x]),reverse=True)
- listAB.sort(key=lambda x: average_rank_score([[z+w for z,w in itertools.izip(a[i],b[i])] for i in x]),reverse=True)
- for i,l in enumerate(listA):
- for j in l:
- idxA[j]=i
- for i,l in enumerate(listB):
- for j in l:
- idxB[j]=i
- for i,l in enumerate(listAB):
- for j in l:
- idx[j]=i
- h=["gene","group_A","time_A","time_B","group_B"]+[i for i in head[2:]]+["group"];
- print("\t".join([str(i) for i in h]),file=out)
- for g,gA,gB,n,ai,bi in itertools.izip(idx,idxA,idxB,name,a,b):
- l=[n,gA,time_index(ai),time_index(bi),gB]+ai+bi+[g]
- s="\t".join([str(i) for i in l])
- print(s,file=out);
- '''
- def read_matfile(f,header=1,col=2): #start col 0
- fin=IO.fopen(f,"r")
- headers=[]
- mat=[]
- names=[]
- for i in xrange(header):
- headers.append(fin.next());
- for i in TableIO.parse(fin):
- names.append(i[0]);
- mat.append([float(a) for a in i[col:]])
- return names,np.array(mat)
- def mat_to_smat(mat,method=0):
- if method==0:
- S=exp(-(sin((arccos(np.corrcoef(mat))/2)**2)))
- S[isnan(S)]=0.01
- #TODO other method
- return S
- def geometry_average(Slist):
- l=len(Slist)
- S=power(Slist[0],1.0/l)
- for S0 in Slist[1:]:
- S*=power(S0,1.0/l)
- return S
- def idx_to_list(idx):
- m=0
- for i in idx:
- if m<i: m=i
- l=[[] for i in xrange(m+1)]
- for i,x in enumerate(idx):
- l[x].append(i)
- return l
- def list_average_rank_score(x):
- s=[average_rank_score(i) for i in x]
- return mean(s)
- def time_index(x):
- return 1.0-rank_score(x)
- def average_rank_score(x):
- l=len(x)
- s=0.0
- for i in x:
- s+=rank_score(i)
- return s/l
- def rank_score(x):
- s=sum(x)
- a=[float(j)/s for j in x]
- l=len(x)
- s1=0.0
- s2=0.0
- for i in a:
- s1+=i
- s2+=s1*1.0/l
- return s2
- def akmeans(S,ro):
- D=sum(S)
- l=len(S)
- print("D=",D)
- D1=D**-1
- c=D1*S
- tr=trace(c)
- value,vector=eigh(c,eigvals=(l-50,l-1))
- #vector=_norm_vector(vector)
- rank=range(0,l)
- print("value=",value)
- max_value=value[-1]
- k=0
- s=0
- for i in value[::-1]:
- s+=i
- if float(s)/tr<ro:
- k+=1
- else:
- break
- print(vector[:,-k-1:-1])
- DBI=10000;
- idx=None;
- centroids=None;
- for i in range(10):
- n_centroids,_ = kmeans(vector[:,-k-1:-1],k)
- n_idx,_=vq(vector[:,-k-1:-1],n_centroids)
- n_DBI=dbi(n_centroids,n_idx,vector[:,-k-1:-1])
- if n_DBI<DBI:
- DBI=n_DBI
- centroids=n_centroids
- idx=n_idx
- print("DBI=",dbi(centroids,idx,vector[:,-k-1:-1]))
- print("N=",len(centroids))
- return idx,vector[:,-k-1:-1]
- def rep(x):
- s=sum([float(i) for i in x])
- return [round(j/s,2) for j in x]
- def dbi(mu,idx,obs):
- d=dist(mu,idx,obs)
- return davies_bouldin_index(mu,d)
- def dist(mu,idx,obs):
- MIN_DIST=0.01
- dist=[0.0 for i in xrange(mu.shape[0])]
- for i in xrange(mu.shape[0]):
- j=-1
- a=compress(equal(idx,i),obs,0)
- for d0 in a:
- dist[i]+=np.sum((d0-mu[i])**2)
- j+=1
- if j>-1:
- dist[i]/=(j+1)
- else:
- dist[i]=MIN_DIST
- return dist
- def davies_bouldin_index(mu,dist):
- db=0.0
- n=len(mu)
- for i in xrange(n):
- db+=max(array([(sqrt(dist[j])+sqrt(dist[i]))/euclidean(mu[i],mu[j]) for j in xrange(n) if j!=i]))
- db*=1.0/n
- return db
- db=0.0
- n=len(mu)
- for i in xrange(n):
- db+=max(array([(sqrt(dist[j])+sqrt(dist[i]))/euclidean(mu[i],mu[j]) for j in xrange(n) if j!=i]))
- db*=1.0/n
- return db
- if __name__=="__main__":
- Main()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement