Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- """
- Benchmark script to bench R's gbm package via rpy2.
- NOTE::
- make sure you run
- $ export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/lib64/R/lib
- """
from functools import wraps
from time import time

import numpy as np
import pylab as pl
import rpy2
from rpy2.robjects.numpy2ri import numpy2ri
from rpy2.robjects.packages import importr
from sklearn import datasets
from sklearn.utils import check_random_state
from sklearn.utils import shuffle
- gbm = importr('gbm')
def repeat(f):
    """Decorator: run the benchmark 10 times and aggregate the scores.

    The wrapped function is called with ``random_state=0 .. 9`` (any other
    positional/keyword arguments are forwarded unchanged) and must return a
    numeric score.  Returns a ``(mean, std)`` tuple of the 10 scores,
    aggregated along axis 0.
    """
    @wraps(f)  # keep the benchmark's __name__/__doc__ for reporting
    def wrapper(*args, **kargs):
        scores = []
        for i in range(10):
            scores.append(f(*args, random_state=i, **kargs))
        scores = np.array(scores)
        return scores.mean(axis=0), scores.std(axis=0)
    return wrapper
# ignore overflows due to exp
#np.seterr(invalid='print', under='print', divide='print', over='ignore')

# Hyper-parameters passed through to R's gbm for the classification
# benchmarks.  String keys are required because R argument names contain
# dots ("n.tree", ...), which are not valid Python identifiers.
classification_params = {"distribution": "bernoulli", "shrinkage": 1.0,
                         "n.tree": 500, "bag.fraction": 0.5, "verbose": False,
                         "n.minobsinnode": 1, "interaction.depth": 1}
@repeat
def bench_random_gaussian(random_state=None):
    """Synthetic Gaussian benchmark (Hastie et al., example 10.2 layout:
    2000 training samples, 10000 test samples); returns the test error rate.
    """
    rng = check_random_state(random_state)
    n_samples, n_features = 12000, 10
    X = rng.normal(size=(n_samples, n_features))
    # Positive class: squared norm above the chi^2(10) median (~9.34).
    y = ((X ** 2.0).sum(axis=1) > 9.34).astype(np.float64)
    X_train, y_train = X[:2000], y[:2000]
    X_test, y_test = X[2000:], y[2000:]
    model = gbm.gbm_fit(numpy2ri(X_train), numpy2ri(y_train),
                        **classification_params)
    raw = gbm.predict_gbm(model, numpy2ri(X_test),
                          **{"n.tree": classification_params["n.tree"]})
    # predict.gbm yields log-odds here; >= 0 means probability >= 0.5.
    predicted = (np.array(raw) >= 0.0).astype(np.float64)
    return np.mean(predicted != y_test)
@repeat
def bench_spam(random_state=None):
    """Benchmark gbm on the UCI spambase dataset; returns the test error
    rate over a shuffled 1536/rest test/train split.
    """
    X = np.loadtxt("/home/pprett/corpora/spam/spambase.data", delimiter=",")
    y = X[:, -1].ravel()
    X = X[:, :-1]
    # Use a context manager so the handle is closed (the original leaked
    # the open file object).  NOTE(review): feature_names is never used
    # below -- kept only to preserve the original's file access.
    with open("/home/pprett/corpora/spam/spambase.names") as f:
        feature_names = np.array([l.split(":")[0] for l in f])
    X, y = shuffle(X, y, random_state=random_state)
    X_test, y_test = X[:1536], y[:1536]
    X_train, y_train = X[1536:], y[1536:]
    # Map {-1, 1} labels onto {0, 1} as expected by the bernoulli loss.
    y_train[y_train == -1.0] = 0
    y_test[y_test == -1.0] = 0
    model = gbm.gbm_fit(numpy2ri(X_train), numpy2ri(y_train),
                        **classification_params)
    pred = gbm.predict_gbm(model, numpy2ri(X_test),
                           **{"n.tree": classification_params["n.tree"]})
    # predict.gbm yields log-odds here; >= 0 means probability >= 0.5.
    pred = (np.array(pred) >= 0.0).astype(np.float64)
    error_rate = np.mean(pred != y_test)
    return error_rate
def bench_madelon():
    """Benchmark gbm on MADELON; returns accuracy on the validation split.

    Not decorated with @repeat: the train/validation split is fixed, so
    there is no random_state to vary.
    """
    X_train = np.loadtxt("/home/pprett/corpora/madelon/madelon_train.data")
    y_train = np.loadtxt("/home/pprett/corpora/madelon/madelon_train.labels")
    X_test = np.loadtxt("/home/pprett/corpora/madelon/madelon_valid.data")
    y_test = np.loadtxt("/home/pprett/corpora/madelon/madelon_valid.labels")
    # Map the {-1, 1} labels onto {0, 1} for the bernoulli loss.
    for labels in (y_train, y_test):
        labels[labels == -1] = 0
    model = gbm.gbm_fit(numpy2ri(X_train), numpy2ri(y_train),
                        **classification_params)
    raw = gbm.predict_gbm(model, numpy2ri(X_test),
                          **{"n.tree": classification_params["n.tree"]})
    predicted = (np.array(raw) >= 0.0).astype(np.float64)
    return np.mean(predicted == y_test)
def bench_arcene():
    """Benchmark gbm on ARCENE; returns accuracy on the validation split.

    Not decorated with @repeat: the train/validation split is fixed, so
    there is no random_state to vary.
    """
    X_train = np.loadtxt("/home/pprett/corpora/arcene/arcene_train.data")
    y_train = np.loadtxt("/home/pprett/corpora/arcene/arcene_train.labels")
    X_test = np.loadtxt("/home/pprett/corpora/arcene/arcene_valid.data")
    y_test = np.loadtxt("/home/pprett/corpora/arcene/arcene_valid.labels")
    # Map the {-1, 1} labels onto {0, 1} for the bernoulli loss.
    for labels in (y_train, y_test):
        labels[labels == -1.0] = 0
    model = gbm.gbm_fit(numpy2ri(X_train), numpy2ri(y_train),
                        **classification_params)
    raw = gbm.predict_gbm(model, numpy2ri(X_test),
                          **{"n.tree": classification_params["n.tree"]})
    predicted = (np.array(raw) >= 0.0).astype(np.float64)
    return np.mean(predicted == y_test)
# Hyper-parameters passed through to R's gbm for the regression
# benchmarks (squared-error loss, deeper trees, smaller shrinkage).
regression_params = {"distribution": "gaussian", "shrinkage": 0.1,
                     "n.tree": 100, "bag.fraction": 1.0, "verbose": False,
                     "n.minobsinnode": 1, "interaction.depth": 4}
@repeat
def bench_boston(random_state=None):
    """Gbm regression on the Boston housing data; returns the test MSE
    over a shuffled 90/10 train/test split.
    """
    boston = datasets.load_boston()
    X, y = shuffle(boston.data, boston.target, random_state=random_state)
    n_train = int(X.shape[0] * 0.9)
    X_train, X_test = X[:n_train], X[n_train:]
    y_train, y_test = y[:n_train], y[n_train:]
    model = gbm.gbm_fit(numpy2ri(X_train), numpy2ri(y_train),
                        **regression_params)
    raw = gbm.predict_gbm(model, numpy2ri(X_test),
                          **{"n.tree": regression_params["n.tree"]})
    residual = np.array(raw, dtype=np.float64) - y_test
    return np.mean(residual ** 2.0)
@repeat
def bench_friedman1(random_state=None):
    """Gbm regression on the Friedman #1 synthetic problem; returns the
    test MSE (200 training samples, 1000 test samples).
    """
    X, y = datasets.make_friedman1(n_samples=1200,
                                   random_state=random_state, noise=1.0)
    X_train, y_train = X[:200], y[:200]
    X_test, y_test = X[200:], y[200:]
    model = gbm.gbm_fit(numpy2ri(X_train), numpy2ri(y_train),
                        **regression_params)
    raw = gbm.predict_gbm(model, numpy2ri(X_test),
                          **{"n.tree": regression_params["n.tree"]})
    residual = np.array(raw, dtype=np.float64) - y_test
    return np.mean(residual ** 2.0)
@repeat
def bench_friedman2(random_state=None):
    """Gbm regression on the Friedman #2 synthetic problem; returns the
    test MSE (200 training samples, 1000 test samples).
    """
    X, y = datasets.make_friedman2(n_samples=1200, random_state=random_state)
    X_train, y_train = X[:200], y[:200]
    X_test, y_test = X[200:], y[200:]
    model = gbm.gbm_fit(numpy2ri(X_train), numpy2ri(y_train),
                        **regression_params)
    raw = gbm.predict_gbm(model, numpy2ri(X_test),
                          **{"n.tree": regression_params["n.tree"]})
    residual = np.array(raw, dtype=np.float64) - y_test
    return np.mean(residual ** 2.0)
@repeat
def bench_friedman3(random_state=None):
    """Gbm regression on the Friedman #3 synthetic problem; returns the
    test MSE (200 training samples, 1000 test samples).
    """
    X, y = datasets.make_friedman3(n_samples=1200, random_state=random_state)
    X_train, y_train = X[:200], y[:200]
    X_test, y_test = X[200:], y[200:]
    model = gbm.gbm_fit(numpy2ri(X_train), numpy2ri(y_train),
                        **regression_params)
    raw = gbm.predict_gbm(model, numpy2ri(X_test),
                          **{"n.tree": regression_params["n.tree"]})
    residual = np.array(raw, dtype=np.float64) - y_test
    return np.mean(residual ** 2.0)
if __name__ == "__main__":
    # NOTE: Python 2 print statements -- this script requires Python 2.
    # Classification benchmarks (repeated ones report (mean, std) error rate).
    print "Example 10.2", bench_random_gaussian()
    print "spam", bench_spam()
    print "Madelon", bench_madelon()
    print "Arcene", bench_arcene()
    # Regression benchmarks (report (mean, std) MSE).
    print "Boston", bench_boston()
    print "Friedman#1", bench_friedman1()
    print "Friedman#2", bench_friedman2()
    print "Friedman#3", bench_friedman3()
Add Comment
Please, Sign In to add comment