Advertisement
PikalaxALT

diffex-0.2

Feb 1st, 2015
383
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 9.86 KB | None | 0 0
  1. ### VERSION 0.2 created Jan 26 2015
  2.  
  3. from __future__ import division
  4. import sys
  5. import matplotlib.pyplot as plt
  6. import numpy as np
  7. import argparse
  8. from sklearn.cluster import KMeans as kmeans
  9. from pylab import savefig
  10. from scipy.optimize import minimize as fminsearch
  11. from sklearn.linear_model import Ridge
  12.  
  13. def funRidge(x,X,D,p_train,p_test):
  14.     R = Ridge(alpha=x)
  15.     R.fit(D[p_train,:], X[p_train])
  16.     return np.linalg.norm(X[p_test] - np.asmatrix(R.predict(D[p_test,:])))
  17.  
  18. def diffex_FormatData(lfile, O, n, chrs=None, metric='mean'):
  19.     M = []
  20.     with open(lfile) as F:
  21.         for line in F:
  22.             if len(line)>1: # Use only lines with nontrivial text.
  23.                 line = line.split('\n')[0].split('\r')[0] # Remove EOL characters
  24.                 parse = line.split('\t')
  25.                 tmp = buildColumn(parse[0], chrs, O, n, metric)
  26.                 tmp = tmp.transpose().astype(np.float64).tolist()[0]
  27.                 M.append(tmp)
  28.     M = np.matrix(M).transpose()
  29.     return M
  30.    
  31. def make_xval_partition(n,n_folds):
  32.     K = np.random.permutation(n)%n_folds
  33.     return K
  34.    
  35. def buildColumn(fname, chrs, O, n, metric):
  36.     print(fname)
  37.     col = np.zeros((n,len(metric)))
  38.     with open(fname) as F:
  39.         for line in F:
  40.             if line[:3] == 'chr':
  41.                 line = line.split('\n')[0].split('\r')[0] # Remove EOL characters
  42.                 tmp = line.split('\t')
  43.                 if tmp[0] in chrs:
  44.                     for i,row in O[tmp[0]]:
  45.                         if int(row[0]) > int(tmp[1]):
  46.                             pass
  47.                         elif int(row[1]) >= int(tmp[2]):
  48.                             nmin = max(int(row[0]),int(tmp[1]))
  49.                             nmax = min(int(row[1]),int(tmp[2]))
  50.                             for j,M in enumerate(metric):
  51.                                 if M=='mean':
  52.                                     col[i,j] += (nmax-nmin+1)*float(tmp[3])/(int(row[1])-int(row[0])+1)
  53.                                 elif M=='max':
  54.                                     col[i,j] = max(col[i,j],float(tmp[3]))
  55.                                 else:
  56.                                     raise ValueError('Invalid metric: %s' % metric)
  57.                         else:
  58.                             pass
  59.                        
  60.     return col
  61.        
  62.        
  63. def getPCs(P, numPCs = -1):
  64.     C = np.cov(P.transpose())
  65.     a,v = np.linalg.eig(C)
  66.     D = (P - np.mean(P,0))*v[:,:numPCs]
  67.     return D,v
  68.  
  69.  
  70.  
  71.  
  72. parser = argparse.ArgumentParser(description='DiffEx 0.2 (C) 2015 Scott Norton (Univ of Pennsylvania School of Medicine, Dept of Biomedical Graduate Studies, Graduate Group in Genomics and Computational Biology), Lyle Ungar, Ph.D (Univ of Pennsylvania, Department of Computer and Information Sciences)',prog='DIFFEX')
  73. parser.add_argument('-1',dest='l1file',help='list of reference file paths for cell type 1')
  74. parser.add_argument('-2',dest='l2file',help='list of reference file paths for cell type 2')
  75. parser.add_argument('-e',dest='expfile',help='path to differential expression file')
  76. parser.add_argument('-m','--metric',dest='metric',type=str,default='mean',help='metric to use for computing the data matrix.  Options: mean, max.  Comma-separate multiple metrics.')
  77. parser.add_argument('-c','--chrs',default=None,dest='chrs',help='list of chromosomes to use, comma-separated i.e. "chr1,chr3,chr7_random".  Default: the whole genome')
  78. parser.add_argument('-o','--out',default='./DiffEx.out',dest='ofile',help='output file path.  Default: DiffEx.out')
  79. parser.add_argument('-k',default=3,dest='k',help='k to use for k-means.  Default: 3',type=int)
  80. parser.add_argument('-s','--save-matrix',action='store_true',dest='saveMatrix',help='choose whether to save the intermediate data matrices.  Ignored for any file loaded using --restore and --restore_path.  Default: False')
  81. parser.add_argument('-r','--restore',dest='restore',default=0,type=int,help='choose the restore point: 0 = start from beginning; 1 = restore matrix; 2 = restore princpal component scores.  1 and 2 require that the --restore_path option be set.  Default: 0.')
  82. parser.add_argument('-p','--restore-path',dest='rfile',default='',help='path to restore file.  Default: same as --out')
  83. parser.add_argument('-n','--num-pcs',dest='numPCs',type=int,default=-1,help='number of principal components to use.  Default: all of them')
  84. parser.add_argument('-t','--threshold',dest='thresh',type=np.float64,default=1,help='threshold for expression data.  Default: 1')
  85. parser.add_argument('-g','--genome',dest='genome',type=str,default='./mm9',help='Path to genome data folder.  Default: ./mm9')
  86. parser.add_argument('-f','--feature',dest='feature',type=str,default='isol',help='Which features to send to PCA.  Options: isol (default), diff')
  87. parsed = parser.parse_args()
  88.  
  89. if parsed.l1file is None or parsed.l2file is None or parsed.expfile is None:
  90.     parser.print_help()
  91.     exit()
  92. if parsed.rfile=='':
  93.     parsed.rfile = parsed.ofile
  94.  
  95. print('Loading transcript coordinates and padding by 2kb...')
  96. geneList = []
  97. geneData = []
  98. if parsed.chrs != None:
  99.     parsed.chrs = parsed.chrs.split(',')
  100.     empty=False
  101. else:
  102.     parsed.chrs = []
  103.     empty=True
  104. with open(parsed.genome+'/refGene.bed') as C:
  105.     for line in C:
  106.         n=1
  107.         if len(line)>1: # Use only lines with nontrivial text.
  108.             line = line.split('\n')[0].split('\r')[0] # Remove EOL characters
  109.             tmp = line.split('\t')
  110.             if empty or tmp[0] in parsed.chrs:
  111.                 tmp[1] = int(tmp[1])-2000
  112.                 tmp[2] = int(tmp[2])+2000
  113.                 geneList.append(tmp[:4])
  114. geneList = np.array(geneList)
  115. O = {}
  116. for i,row in enumerate(geneList):
  117.     if row[0] in O.keys():
  118.         O[row[0]].append((i, row[1:]))
  119.     else:
  120.         O[row[0]] = [(i, row[1:])]
  121. n = sum([len(O[key]) for key in O.keys()])
  122. dtype = np.dtype("i8,i8,|S16")
  123. for key in O.keys():
  124.     O[key].sort()
  125.  
  126.  
  127. if parsed.restore>=1:
  128.     print('Restoring saved data matrix...')
  129.     M = np.asmatrix(np.genfromtxt(parsed.rfile+'.data.txt', delimiter='\t'))
  130.     X = np.asmatrix(np.genfromtxt(parsed.rfile+'.expression.txt', delimiter='\t')).transpose()
  131.     geneList = np.genfromtxt(parsed.rfile+'.key.txt', delimiter='\t', dtype=str)
  132.     if parsed.restore>=2:
  133.         print('Restoring saved principal component scores...')
  134.         D = np.asmatrix(np.genfromtxt(parsed.rfile+'.pcs.txt', delimiter='\t'))
  135.         V = np.asmatrix(np.genfromtxt(parsed.rfile+'.pcv.txt', delimiter='\t'))
  136.     else:
  137.         print('Running principal component analysis...')
  138.         D,V = getPCs(M, parsed.numPCs)
  139.         if parsed.saveMatrix:
  140.             print('Saving principal component scores...')
  141.             np.savetxt(parsed.ofile+'.pcs.txt', D, delimiter='\t', fmt='%.6f')
  142.             np.savetxt(parsed.ofile+'.pcv.txt', V, delimiter='\t')
  143. else:
  144.     print('Loading differential expression data...')
  145.     parsed.metric = parsed.metric.split(',')
  146.     X = buildColumn(parsed.expfile,parsed.chrs,O,n,parsed.metric)
  147.     print('Loading and restructuring epigenomic data from cell type 1...')
  148.     M1 = diffex_FormatData(parsed.l1file, O, n, parsed.chrs, parsed.metric)
  149.     print('Loading and restructuring epigenomic data from cell type 2...')
  150.     M2 = diffex_FormatData(parsed.l2file, O, n, parsed.chrs, parsed.metric)
  151.     if parsed.feature=='isol':
  152.         M = np.concatenate((M1, M2), axis=1)
  153.     elif parsed.feature=='diff':
  154.         M = M1 - M2
  155.     print('Removing all-zero and infinite entries...')
  156.     LP = np.concatenate((M,X),axis=1)
  157.     X = np.asarray([X[k,:].tolist()[0] for k,x in enumerate(LP) if not (np.linalg.norm(x) == 0 or np.isinf(np.linalg.norm(x)) or np.iscomplex(np.linalg.norm(x)) or np.isnan(np.linalg.norm(x)))])
  158.     M = np.asmatrix([M[k,:].tolist()[0] for k,x in enumerate(LP) if not (np.linalg.norm(x) == 0 or np.isinf(np.linalg.norm(x)) or np.iscomplex(np.linalg.norm(x)) or np.isnan(np.linalg.norm(x)))])
  159.     geneList = np.asarray([geneList[k,:].tolist() for k,x in enumerate(LP) if not (np.linalg.norm(x) == 0 or np.isinf(np.linalg.norm(x)) or np.iscomplex(np.linalg.norm(x)) or np.isnan(np.linalg.norm(x)))])
  160.     if parsed.saveMatrix:
  161.         print('Saving data matrix...')
  162.         np.savetxt(parsed.ofile+'.data.txt', M, delimiter='\t', fmt='%.3f')
  163.         np.savetxt(parsed.ofile+'.expression.txt', X, delimiter='\n', fmt='%.6f')
  164.         np.savetxt(parsed.ofile+'.key.txt', geneList, delimiter='\t', fmt='%s')
  165.     print('Running principal component analysis...')
  166.     D,V = getPCs(M, parsed.numPCs)
  167.     if parsed.saveMatrix:
  168.         np.savetxt(parsed.ofile+'.pcs.txt', D, delimiter='\t', fmt='%.6f')
  169. X = np.asmatrix(X)
  170. if len(X) != X.size:
  171.     X = X.transpose()
  172. N = len(X)
  173.  
  174. print('Ridge regression on PCA data...')
  175. D = D/np.linalg.norm(D, axis=0)
  176. # D = np.concatenate((np.ones((len(D),1)), D), axis=1)
  177. P = parsed.numPCs
  178. # Use 5-fold cross-validation with gradient descent to choose lambda
  179. part = make_xval_partition(N,5)
  180. lambdas = []
  181. for i in range(5):
  182.     print('\tPartition %d...' % (i+1))
  183.     p_train = [j for j,k in enumerate(part) if k != i]
  184.     p_test = [j for j,k in enumerate(part) if k == i]
  185.     fun_ridge = lambda x: funRidge(x,X,D,p_train,p_test)
  186.     RES = fminsearch(fun_ridge, x0=0)
  187.     lambdas.append(abs(RES.x))
  188. l = np.mean(lambdas)
  189. print('Optimal regularization parameter: %.8f' % l)
  190. R = Ridge(alpha=l)
  191. R.fit(D,X)
  192. w = np.asmatrix(R.coef_).transpose()
  193. resids = X - np.asmatrix(R.predict(D))
  194. trn_err = np.linalg.norm(resids)/np.sqrt(N)
  195. print('Training error: %f' % trn_err)
  196. if parsed.saveMatrix:
  197.     np.savetxt(parsed.ofile+'.ridge.txt', w, delimiter='\n', fmt='%.8f')
  198.     np.savetxt(parsed.ofile+'.resid.txt', resids, delimiter='\t', fmt='%.8f')
  199.    
  200. print('K-means clustering using %d clusters...' % parsed.k)
  201. KM = kmeans(n_clusters=parsed.k)
  202. labels = KM.fit_predict(M)
  203. labels = np.array(labels, dtype=int)
  204. centroids = np.zeros((parsed.k,M.size/len(M)))
  205. E = np.zeros((parsed.k,3))
  206. F = np.zeros((parsed.k,2))
  207. print('Computing cluster composition...')
  208. for i in range(parsed.k):
  209.     D_tmp = np.asmatrix([v.tolist()[0] for k,v in enumerate(M) if int(labels[k]) == i])
  210.     centroids[i,:] = np.mean(D_tmp,0)
  211.     X_tmp = [int(np.sign(v)*(np.abs(v)>parsed.thresh)) for k,v in enumerate(X) if int(labels[k]) == i]
  212.     Bot = len(X_tmp)
  213.     if Bot>0:
  214.         for j in range(3):
  215.             Top = len([v for v in X_tmp if v == j-1])
  216.             E[i,j] = float(Top)/float(Bot)
  217.     F[i,0] = np.mean(X_tmp)
  218.     F[i,1] = float(Bot)/float(len(labels))
  219. print(np.concatenate((E,F),axis=1))
  220. if parsed.saveMatrix:
  221.     np.savetxt(parsed.ofile+'.distrib.txt', E, delimiter='\t', fmt='%.6f')
  222.     np.savetxt(parsed.ofile+'.centroids.txt', centroids, delimiter='\t', fmt='%.6f')
  223.     np.savetxt(parsed.ofile+'.labels.txt', labels, delimiter='\t', fmt='%d')
  224.     np.savetxt(parsed.ofile+'.means.txt', F, delimiter='\t', fmt='%d')
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement