Advertisement
Guest User

Untitled

a guest
May 27th, 2015
255
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 6.67 KB | None | 0 0
  1. #!/usr/bin/env python
  2. # Programmer : zhuxp
  3. # Date:
  4. # Last-modified: 04-16-2015, 15:08:44 EDT
  5. from __future__ import print_function
  6. VERSION="0.1"
  7. import os,sys,argparse
  8. from bam2x import TableIO,Tools
  9. from bam2x import IO
  10. import numpy as np
  11. from numpy import linalg as LA
  12. from scipy.linalg import eigh
  13. import numpy as np
  14. import itertools
  15. from bam2x.Tools import gini_coefficient as gini_index
  16. from scipy.cluster.vq import kmeans,vq
  17. from numpy import exp,min,arccos,sin,trace,compress,equal,array,sqrt,isnan,power,mean
  18. from scipy.spatial.distance import squareform,pdist,euclidean
  19. from os.path import splitext
  20. import logging
  21. def ParseArg():
  22. ''' This Function Parse the Argument '''
  23. p=argparse.ArgumentParser( description = 'Example: %(prog)s -h', epilog='Library dependency : bam2x')
  24. p.add_argument('-v','--version',action='version',version='%(prog)s '+VERSION)
  25. p.add_argument('-i','--input',dest="input",nargs="*",help="input expression matrix files")
  26. p.add_argument('-o','--output',dest="output",type=str,default="stdout",help="output file DEFAULT: STDOUT")
  27. p.add_argument('-c',dest="start_col",type=int,default=3,help="start column default:%(default)i")
  28. p.add_argument('-e','--eigen_cutoff',dest="e",type=float,default=0.95,help="eigen proportion")
  29. if len(sys.argv)==1:
  30. print(p.print_help(),file=sys.stderr)
  31. exit(0)
  32. return p.parse_args()
  33. def Main():
  34. global args,out,logger
  35. logger=logging.getLogger("akmeans logger")
  36. logger.setLevel(logging.DEBUG)
  37. logger.addHandler(logging.StreamHandler())
  38. args=ParseArg()
  39. out=IO.fopen(args.output,"w")
  40. #fname,ext=splitext(args.input)
  41. mlist=[]
  42. namelist=[]
  43. slist=[]
  44. idxlist=[]
  45. for f in args.input:
  46. logger.info("reading "+f)
  47. names,mat=read_matfile(f)
  48. mlist.append(mat)
  49. namelist.append(names)
  50. slist.append(mat_to_smat(mat))
  51. idx,_=akmeans(slist[-1],args.e)
  52. lst=idx_to_list(idx)
  53. lst.sort(key=lambda x: average_rank_score([mat[i] for i in x]),reverse=True)
  54. for i,l in enumerate(lst):
  55. for j in l:
  56. idx[j]=i
  57. idxlist.append(idx)
  58. for i in idxlist:
  59. logger.info(i)
  60. S=geometry_average(slist)
  61. idx,_=akmeans(S,args.e)
  62. lst=idx_to_list(idx)
  63. lst.sort(key=lambda x: list_average_rank_score([[mat[i] for i in x] for mat in mlist]),reverse=True)
  64. for i,l in enumerate(lst):
  65. for j in l:
  66. idx[j]=i
  67. idxlist.append(idx)
  68. logger.info(idx)
  69. idxt=array(idxlist).T
  70. print("gene\t"+"\t".join(args.input)+"\t"+"Interagrated",file=out)
  71. for i,x in enumerate(idxt):
  72. print(names[i]+"\t"+"\t".join([str(j) for j in x]),file=out);
  73.  
  74.  
  75.  
  76. '''
  77. idx,vector = akmeans(S,args.e)
  78. idxA,_ = akmeans(SA,args.e1)
  79. idxB,_ = akmeans(SB,args.e1)
  80.  
  81. listA = idx_to_list(idxA)
  82. listB = idx_to_list(idxB)
  83. listAB=idx_to_list(idx)
  84.  
  85. listA.sort(key=lambda x: average_rank_score([a[i] for i in x]),reverse=True)
  86. listB.sort(key=lambda x: average_rank_score([b[i] for i in x]),reverse=True)
  87. listAB.sort(key=lambda x: average_rank_score([[z+w for z,w in itertools.izip(a[i],b[i])] for i in x]),reverse=True)
  88.  
  89. for i,l in enumerate(listA):
  90. for j in l:
  91. idxA[j]=i
  92. for i,l in enumerate(listB):
  93. for j in l:
  94. idxB[j]=i
  95. for i,l in enumerate(listAB):
  96. for j in l:
  97. idx[j]=i
  98.  
  99. h=["gene","group_A","time_A","time_B","group_B"]+[i for i in head[2:]]+["group"];
  100. print("\t".join([str(i) for i in h]),file=out)
  101. for g,gA,gB,n,ai,bi in itertools.izip(idx,idxA,idxB,name,a,b):
  102. l=[n,gA,time_index(ai),time_index(bi),gB]+ai+bi+[g]
  103. s="\t".join([str(i) for i in l])
  104. print(s,file=out);
  105. '''
  106. def read_matfile(f,header=1,col=2): #start col 0
  107. fin=IO.fopen(f,"r")
  108. headers=[]
  109. mat=[]
  110. names=[]
  111. for i in xrange(header):
  112. headers.append(fin.next());
  113. for i in TableIO.parse(fin):
  114. names.append(i[0]);
  115. mat.append([float(a) for a in i[col:]])
  116. return names,np.array(mat)
  117. def mat_to_smat(mat,method=0):
  118. if method==0:
  119. S=exp(-(sin((arccos(np.corrcoef(mat))/2)**2)))
  120. S[isnan(S)]=0.01
  121. #TODO other method
  122. return S
  123. def geometry_average(Slist):
  124. l=len(Slist)
  125. S=power(Slist[0],1.0/l)
  126. for S0 in Slist[1:]:
  127. S*=power(S0,1.0/l)
  128. return S
  129.  
  130. def idx_to_list(idx):
  131. m=0
  132. for i in idx:
  133. if m<i: m=i
  134. l=[[] for i in xrange(m+1)]
  135. for i,x in enumerate(idx):
  136. l[x].append(i)
  137. return l
  138. def list_average_rank_score(x):
  139. s=[average_rank_score(i) for i in x]
  140. return mean(s)
  141. def time_index(x):
  142. return 1.0-rank_score(x)
  143. def average_rank_score(x):
  144. l=len(x)
  145. s=0.0
  146. for i in x:
  147. s+=rank_score(i)
  148. return s/l
  149. def rank_score(x):
  150. s=sum(x)
  151. a=[float(j)/s for j in x]
  152. l=len(x)
  153. s1=0.0
  154. s2=0.0
  155. for i in a:
  156. s1+=i
  157. s2+=s1*1.0/l
  158. return s2
  159.  
  160. def akmeans(S,ro):
  161. D=sum(S)
  162. l=len(S)
  163. print("D=",D)
  164. D1=D**-1
  165. c=D1*S
  166. tr=trace(c)
  167. value,vector=eigh(c,eigvals=(l-50,l-1))
  168. #vector=_norm_vector(vector)
  169. rank=range(0,l)
  170. print("value=",value)
  171. max_value=value[-1]
  172. k=0
  173. s=0
  174. for i in value[::-1]:
  175. s+=i
  176. if float(s)/tr<ro:
  177. k+=1
  178. else:
  179. break
  180. print(vector[:,-k-1:-1])
  181. DBI=10000;
  182. idx=None;
  183. centroids=None;
  184. for i in range(10):
  185. n_centroids,_ = kmeans(vector[:,-k-1:-1],k)
  186. n_idx,_=vq(vector[:,-k-1:-1],n_centroids)
  187. n_DBI=dbi(n_centroids,n_idx,vector[:,-k-1:-1])
  188. if n_DBI<DBI:
  189. DBI=n_DBI
  190. centroids=n_centroids
  191. idx=n_idx
  192. print("DBI=",dbi(centroids,idx,vector[:,-k-1:-1]))
  193. print("N=",len(centroids))
  194. return idx,vector[:,-k-1:-1]
  195.  
  196. def rep(x):
  197. s=sum([float(i) for i in x])
  198. return [round(j/s,2) for j in x]
  199.  
  200. def dbi(mu,idx,obs):
  201. d=dist(mu,idx,obs)
  202. return davies_bouldin_index(mu,d)
  203. def dist(mu,idx,obs):
  204. MIN_DIST=0.01
  205. dist=[0.0 for i in xrange(mu.shape[0])]
  206. for i in xrange(mu.shape[0]):
  207. j=-1
  208. a=compress(equal(idx,i),obs,0)
  209. for d0 in a:
  210. dist[i]+=np.sum((d0-mu[i])**2)
  211. j+=1
  212. if j>-1:
  213. dist[i]/=(j+1)
  214. else:
  215. dist[i]=MIN_DIST
  216. return dist
  217. def davies_bouldin_index(mu,dist):
  218. db=0.0
  219. n=len(mu)
  220. for i in xrange(n):
  221. db+=max(array([(sqrt(dist[j])+sqrt(dist[i]))/euclidean(mu[i],mu[j]) for j in xrange(n) if j!=i]))
  222. db*=1.0/n
  223. return db
  224. db=0.0
  225. n=len(mu)
  226. for i in xrange(n):
  227. db+=max(array([(sqrt(dist[j])+sqrt(dist[i]))/euclidean(mu[i],mu[j]) for j in xrange(n) if j!=i]))
  228. db*=1.0/n
  229. return db
  230.  
  231.  
  232. if __name__=="__main__":
  233. Main()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement