Guest User

Untitled

a guest
Aug 19th, 2018
80
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 7.02 KB | None | 0 0
  1. """
  2. Benchmark script to bench R's gbm package via rpy2.
  3.  
  4. NOTE::
  5.  
  6. make sure you run
  7. $ export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/lib64/R/lib
  8.  
  9. """
  10.  
  11. import numpy as np
  12. import rpy2
  13.  
  14. from time import time
  15.  
  16. from sklearn import datasets
  17. from sklearn.utils import shuffle
  18. from sklearn.utils import check_random_state
  19.  
  20. from rpy2.robjects.numpy2ri import numpy2ri
  21. from rpy2.robjects.packages import importr
  22.  
  23.  
  24. import pylab as pl
  25.  
  26. gbm = importr('gbm')
  27.  
  28. def repeat(f):
  29. def wrapper(*args, **kargs):
  30. scores = []
  31. for i in range(10):
  32. scores.append(f(*args, random_state=i, **kargs))
  33. scores = np.array(scores)
  34. return scores.mean(axis=0), scores.std(axis=0)
  35. return wrapper
  36.  
  37.  
  38. # ignore overflows due to exp
  39. #np.seterr(invalid='print', under='print', divide='print', over='ignore')
  40.  
  41. classification_params = {"distribution": "bernoulli", "shrinkage": 1.0,
  42. "n.tree": 500, "bag.fraction": 0.5, "verbose": False,
  43. "n.minobsinnode": 1, "interaction.depth": 1}
  44.  
  45. @repeat
  46. def bench_random_gaussian(random_state=None):
  47. rs = check_random_state(random_state)
  48. shape = (12000, 10)
  49. X = rs.normal(size=shape).reshape(shape)
  50. y = ((X ** 2.0).sum(axis=1) > 9.34).astype(np.float64)
  51.  
  52. X_train, X_test = X[:2000], X[2000:]
  53. y_train, y_test = y[:2000], y[2000:]
  54.  
  55. X_train = numpy2ri(X_train)
  56. X_test = numpy2ri(X_test)
  57. y_train = numpy2ri(y_train)
  58.  
  59. model = gbm.gbm_fit(X_train, y_train, **classification_params)
  60. pred = gbm.predict_gbm(model, X_test,
  61. **{"n.tree":classification_params["n.tree"]})
  62. pred = (np.array(pred) >= 0.0).astype(np.float64)
  63. error_rate = np.mean(pred != y_test)
  64. return error_rate
  65.  
  66.  
  67. @repeat
  68. def bench_spam(random_state=None):
  69. X = np.loadtxt("/home/pprett/corpora/spam/spambase.data", delimiter=",")
  70. y = X[:, -1].ravel()
  71. X = X[:, :-1]
  72. f = open("/home/pprett/corpora/spam/spambase.names")
  73. feature_names = np.array([l.split(":")[0] for l in f])
  74.  
  75. X, y = shuffle(X, y, random_state=random_state)
  76. X_test, y_test = X[:1536], y[:1536]
  77. X_train, y_train = X[1536:], y[1536:]
  78.  
  79. y_train[y_train == -1.0] = 0
  80. y_test[y_test == -1.0] = 0
  81.  
  82. X_train = numpy2ri(X_train)
  83. X_test = numpy2ri(X_test)
  84. y_train = numpy2ri(y_train)
  85.  
  86. model = gbm.gbm_fit(X_train, y_train, **classification_params)
  87. pred = gbm.predict_gbm(model, X_test,
  88. **{"n.tree":classification_params["n.tree"]})
  89. pred = (np.array(pred) >= 0.0).astype(np.float64)
  90. error_rate = np.mean(pred != y_test)
  91. return error_rate
  92.  
  93.  
  94. def bench_madelon():
  95. X_train = np.loadtxt("/home/pprett/corpora/madelon/madelon_train.data")
  96. y_train = np.loadtxt("/home/pprett/corpora/madelon/madelon_train.labels")
  97. X_test = np.loadtxt("/home/pprett/corpora/madelon/madelon_valid.data")
  98. y_test = np.loadtxt("/home/pprett/corpora/madelon/madelon_valid.labels")
  99.  
  100. y_train[y_train == -1] = 0
  101. y_test[y_test == -1] = 0
  102.  
  103. X_train = numpy2ri(X_train)
  104. X_test = numpy2ri(X_test)
  105. y_train = numpy2ri(y_train)
  106.  
  107. model = gbm.gbm_fit(X_train, y_train, **classification_params)
  108. pred = gbm.predict_gbm(model, X_test,
  109. **{"n.tree":classification_params["n.tree"]})
  110. pred = (np.array(pred) >= 0.0).astype(np.float64)
  111. score = np.mean(pred == y_test)
  112.  
  113. return score
  114.  
  115.  
  116. def bench_arcene():
  117. X_train = np.loadtxt("/home/pprett/corpora/arcene/arcene_train.data")
  118. y_train = np.loadtxt("/home/pprett/corpora/arcene/arcene_train.labels")
  119. X_test = np.loadtxt("/home/pprett/corpora/arcene/arcene_valid.data")
  120. y_test = np.loadtxt("/home/pprett/corpora/arcene/arcene_valid.labels")
  121.  
  122. y_train[y_train == -1.0] = 0
  123. y_test[y_test == -1.0] = 0
  124.  
  125. X_train = numpy2ri(X_train)
  126. X_test = numpy2ri(X_test)
  127. y_train = numpy2ri(y_train)
  128.  
  129. model = gbm.gbm_fit(X_train, y_train, **classification_params)
  130. pred = gbm.predict_gbm(model, X_test,
  131. **{"n.tree":classification_params["n.tree"]})
  132. pred = (np.array(pred) >= 0.0).astype(np.float64)
  133. score = np.mean(pred == y_test)
  134.  
  135. return score
  136.  
  137.  
  138. regression_params = {"distribution": "gaussian", "shrinkage": 0.1,
  139. "n.tree": 100, "bag.fraction": 1.0, "verbose": False,
  140. "n.minobsinnode": 1, "interaction.depth": 4}
  141.  
  142. @repeat
  143. def bench_boston(random_state=None):
  144. boston = datasets.load_boston()
  145. X, y = shuffle(boston.data, boston.target, random_state=random_state)
  146. offset = int(X.shape[0] * 0.9)
  147. X_train = X[:offset]
  148. y_train = y[:offset]
  149. X_test = X[offset:]
  150. y_test = y[offset:]
  151.  
  152. X_train = numpy2ri(X_train)
  153. X_test = numpy2ri(X_test)
  154. y_train = numpy2ri(y_train)
  155.  
  156. model = gbm.gbm_fit(X_train, y_train, **regression_params)
  157. pred = gbm.predict_gbm(model, X_test,
  158. **{"n.tree":regression_params["n.tree"]})
  159. pred = np.array(pred, dtype=np.float64)
  160. mse = np.mean((pred - y_test) ** 2.0)
  161. return mse
  162.  
  163.  
  164. @repeat
  165. def bench_friedman1(random_state=None):
  166. X, y = datasets.make_friedman1(n_samples=1200,
  167. random_state=random_state, noise=1.0)
  168. X_train, y_train = X[:200], y[:200]
  169. X_test, y_test = X[200:], y[200:]
  170.  
  171. X_train = numpy2ri(X_train)
  172. X_test = numpy2ri(X_test)
  173. y_train = numpy2ri(y_train)
  174.  
  175. model = gbm.gbm_fit(X_train, y_train, **regression_params)
  176. pred = gbm.predict_gbm(model, X_test,
  177. **{"n.tree":regression_params["n.tree"]})
  178. pred = np.array(pred, dtype=np.float64)
  179. mse = np.mean((pred - y_test) ** 2.0)
  180. return mse
  181.  
  182.  
  183. @repeat
  184. def bench_friedman2(random_state=None):
  185. X, y = datasets.make_friedman2(n_samples=1200, random_state=random_state)
  186. X_train, y_train = X[:200], y[:200]
  187. X_test, y_test = X[200:], y[200:]
  188.  
  189. X_train = numpy2ri(X_train)
  190. X_test = numpy2ri(X_test)
  191. y_train = numpy2ri(y_train)
  192.  
  193. model = gbm.gbm_fit(X_train, y_train, **regression_params)
  194. pred = gbm.predict_gbm(model, X_test,
  195. **{"n.tree":regression_params["n.tree"]})
  196. pred = np.array(pred, dtype=np.float64)
  197.  
  198. mse = np.mean((pred - y_test) ** 2.0)
  199.  
  200. return mse
  201.  
  202.  
  203. @repeat
  204. def bench_friedman3(random_state=None):
  205. X, y = datasets.make_friedman3(n_samples=1200, random_state=random_state)
  206. X_train, y_train = X[:200], y[:200]
  207. X_test, y_test = X[200:], y[200:]
  208.  
  209. X_train = numpy2ri(X_train)
  210. X_test = numpy2ri(X_test)
  211. y_train = numpy2ri(y_train)
  212.  
  213. model = gbm.gbm_fit(X_train, y_train, **regression_params)
  214. pred = gbm.predict_gbm(model, X_test,
  215. **{"n.tree":regression_params["n.tree"]})
  216. pred = np.array(pred, dtype=np.float64)
  217.  
  218. mse = np.mean((pred - y_test) ** 2.0)
  219. return mse
  220.  
  221.  
  222. if __name__ == "__main__":
  223.  
  224. print "Example 10.2", bench_random_gaussian()
  225. print "spam", bench_spam()
  226.  
  227. print "Madelon", bench_madelon()
  228. print "Arcene", bench_arcene()
  229.  
  230. print "Boston", bench_boston()
  231. print "Friedman#1", bench_friedman1()
  232. print "Friedman#2", bench_friedman2()
  233. print "Friedman#3", bench_friedman3()
Add Comment
Please, Sign In to add comment