Guest User

Untitled

a guest
Jan 16th, 2018
76
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 3.20 KB | None | 0 0
import gzip
import pickle

import numpy as np
import pandas as pd
  5.  
  6.  
  7. # in this example tanh normalization is used
  8. # fold 0 is used for testing and fold 1 for validation (hyperparamter selection)
  9. norm = 'tanh'
  10. test_fold = 0
  11. val_fold = 1
  12.  
  13. def normalize(X, means1=None, std1=None, means2=None, std2=None, feat_filt=None, norm='tanh_norm'):
  14. if std1 is None:
  15. std1 = np.nanstd(X, axis=0)
  16. if feat_filt is None:
  17. feat_filt = std1!=0
  18. X = X[:,feat_filt]
  19. X = np.ascontiguousarray(X)
  20. if means1 is None:
  21. means1 = np.mean(X, axis=0)
  22. X = (X-means1)/std1[feat_filt]
  23. if norm == 'norm':
  24. return(X, means1, std1, feat_filt)
  25. elif norm == 'tanh':
  26. return(np.tanh(X), means1, std1, feat_filt)
  27. elif norm == 'tanh_norm':
  28. X = np.tanh(X)
  29. if means2 is None:
  30. means2 = np.mean(X, axis=0)
  31. if std2 is None:
  32. std2 = np.std(X, axis=0)
  33. X = (X-means2)/std2
  34. X[:,std2==0]=0
  35. return(X, means1, std1, means2, std2, feat_filt)
  36.  
  37. #contains the data in both feature ordering ways (drug A - drug B - cell line and drug B - drug A - cell line)
  38. #in the first half of the data the features are ordered (drug A - drug B - cell line)
  39. #in the second half of the data the features are ordered (drug B - drug A - cell line)
  40. file = gzip.open('X.p.gz', 'rb')
  41. X = pickle.load(file)
  42. file.close()
  43.  
  44.  
  45.  
  46. #contains synergy values and fold split (numbers 0-4)
  47. labels = pd.read_csv('labels.csv', index_col=0)
  48. #labels are duplicated for the two different ways of ordering in the data
  49. labels = pd.concat([labels, labels])
  50.  
  51.  
  52.  
  53. #indices of training data for hyperparameter selection: fold 2, 3, 4
  54. idx_tr = np.where(np.logical_and(labels['fold']!=test_fold, labels['fold']!=val_fold))
  55. #indices of validation data for hyperparameter selection: fold 1
  56. idx_val = np.where(labels['fold']==val_fold)
  57.  
  58. #indices of training data for model testing: fold 1, 2, 3, 4
  59. idx_train = np.where(labels['fold']!=test_fold)
  60. #indices of test data for model testing: fold 0
  61. idx_test = np.where(labels['fold']==test_fold)
  62.  
  63.  
  64.  
  65. X_tr = X[idx_tr]
  66. X_val = X[idx_val]
  67. X_train = X[idx_train]
  68. X_test = X[idx_test]
  69.  
  70. y_tr = labels.iloc[idx_tr]['synergy'].values
  71. y_val = labels.iloc[idx_val]['synergy'].values
  72. y_train = labels.iloc[idx_train]['synergy'].values
  73. y_test = labels.iloc[idx_test]['synergy'].values
  74.  
  75.  
  76. if norm == "tanh_norm":
  77. X_tr, mean, std, mean2, std2, feat_filt = normalize(X_tr, norm=norm)
  78. X_val, mean, std, mean2, std2, feat_filt = normalize(X_val, mean, std, mean2, std2,
  79. feat_filt=feat_filt, norm=norm)
  80. else:
  81. X_tr, mean, std, feat_filt = normalize(X_tr, norm=norm)
  82. X_val, mean, std, feat_filt = normalize(X_val, mean, std, feat_filt=feat_filt, norm=norm)
  83.  
  84.  
  85. if norm == "tanh_norm":
  86. X_train, mean, std, mean2, std2, feat_filt = normalize(X_train, norm=norm)
  87. X_test, mean, std, mean2, std2, feat_filt = normalize(X_test, mean, std, mean2, std2,
  88. feat_filt=feat_filt, norm=norm)
  89. else:
  90. X_train, mean, std, feat_filt = normalize(X_train, norm=norm)
  91. X_test, mean, std, feat_filt = normalize(X_test, mean, std, feat_filt=feat_filt, norm=norm)
  92.  
  93. pickle.dump((X_tr, X_val, X_train, X_test, y_tr, y_val, y_train, y_test), open('data_test_fold%d_%s.p'%(test_fold, norm), 'wb'))
Add Comment
Please, Sign In to add comment