Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
"""
openml_rerf_test.py

Compare the test accuracy of a RerF forest and a scikit-learn random
forest on a single OpenML task.
"""
import argparse
import sys

import numpy as np
import openml
import pandas as pd
import sklearn
import sklearn.pipeline
import sklearn.preprocessing
from RerF import fastRerF, fastPredict
from sklearn import compose, impute, feature_selection
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
# --
# CLI
def parse_args(argv=None) -> argparse.Namespace:
    """Parse the command-line options of the benchmark.

    Args:
        argv: Optional list of argument strings. When ``None`` (the
            default) argparse reads ``sys.argv[1:]``, which is the
            behaviour existing callers rely on.

    Returns:
        Namespace with ``task_id``, ``num_cores``, ``num_trees`` and
        ``seed`` attributes.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--task-id', type=int, default=3)
    parser.add_argument('--num-cores', type=int, default=16)
    parser.add_argument('--num-trees', type=int, default=500)
    parser.add_argument('--seed', type=int, default=123)
    return parser.parse_args(argv)
args = parse_args()
# Seed NumPy's global RNG once, before any model is fitted.
np.random.seed(args.seed)

# --
# Load dataset

task = openml.tasks.get_task(args.task_id)
X, y = task.get_X_and_y()

# Use first split (for now)
train_idx, test_idx = task.get_train_test_split_indices()

# NOTE(review): assumes X and y support integer-array indexing (e.g. NumPy
# arrays) -- confirm against the installed openml version's return format.
X_train, X_test = X[train_idx], X[test_idx]
y_train, y_test = y[train_idx], y[test_idx]
# --
# Preprocess data

dataset = task.get_dataset()
nominal_indices = dataset.get_features_by_type(data_type='nominal', exclude=[task.target_name])
numeric_indices = dataset.get_features_by_type(data_type='numeric', exclude=[task.target_name])

prep = sklearn.pipeline.make_pipeline(
    sklearn.compose.ColumnTransformer(
        transformers=[
            # Numeric columns: impute missing values (default strategy), then
            # standardize. SimpleImputer replaces the removed
            # sklearn.preprocessing.Imputer.
            ('numeric', sklearn.pipeline.make_pipeline(
                sklearn.impute.SimpleImputer(),
                sklearn.preprocessing.StandardScaler(),
            ), numeric_indices),
            # Nominal columns: fill missing values with the constant -1, then
            # one-hot encode; categories unseen during fit are ignored at
            # transform time.
            ('nominal', sklearn.pipeline.make_pipeline(
                sklearn.impute.SimpleImputer(strategy='constant', fill_value=-1),
                sklearn.preprocessing.OneHotEncoder(handle_unknown='ignore'),
            ), nominal_indices),
        ],
        # Columns that are neither numeric nor nominal are kept unchanged.
        remainder='passthrough',
    ),
    sklearn.feature_selection.VarianceThreshold(),
)

# Fit the preprocessing on the training split only, then reuse it for test.
Xf_train = prep.fit_transform(X_train)
Xf_test = prep.transform(X_test)
# --
# Train models
def fit_rerf(Xf_train, Xf_test, y_train, y_test, num_trees, num_cores):
    """Fit a RerF forest on the training data and predict the test data.

    Args:
        Xf_train: Preprocessed training features.
        Xf_test: Preprocessed test features.
        y_train: Training labels.
        y_test: Unused; accepted so this function can be called with the
            same keyword arguments as ``fit_sklearn``.
        num_trees: Number of trees, passed to ``fastRerF`` as ``trees``.
        num_cores: Number of cores, passed to ``fastRerF`` as ``numCores``.

    Returns:
        Whatever ``fastPredict`` returns for ``Xf_test``.
    """
    rerf_forest = fastRerF(
        X=Xf_train,
        Y=y_train,
        forestType="binnedBaseRerF",
        trees=num_trees,
        numCores=num_cores,
    )
    return fastPredict(X=Xf_test, forest=rerf_forest)
def fit_sklearn(Xf_train, Xf_test, y_train, y_test, num_trees, num_cores):
    """Fit a scikit-learn random forest and predict the test data.

    Args:
        Xf_train: Preprocessed training features.
        Xf_test: Preprocessed test features.
        y_train: Training labels.
        y_test: Unused; accepted so this function can be called with the
            same keyword arguments as ``fit_rerf``.
        num_trees: Passed to ``RandomForestClassifier`` as ``n_estimators``.
        num_cores: Passed to ``RandomForestClassifier`` as ``n_jobs``.

    Returns:
        The fitted forest's predictions for ``Xf_test``.
    """
    sk_forest = RandomForestClassifier(n_estimators=num_trees, n_jobs=num_cores)
    sk_forest = sk_forest.fit(Xf_train, y_train)
    return sk_forest.predict(Xf_test)
# Both fit functions take the same keyword arguments.
kwargs = {
    "Xf_train": Xf_train,
    "Xf_test": Xf_test,
    "y_train": y_train,
    "y_test": y_test,
    "num_trees": args.num_trees,
    "num_cores": args.num_cores,
}

# Number of times each model is refitted; accuracies are averaged over these.
NUM_REPEATS = 10

print('-' * 50, file=sys.stderr)
print('fit rerf', file=sys.stderr)
rerf_pred = [fit_rerf(**kwargs) for _ in range(NUM_REPEATS)]
rerf_accs = [(y_test == p).mean() for p in rerf_pred]

print('-' * 50, file=sys.stderr)
print('fit sklearn', file=sys.stderr)
sk_pred = [fit_sklearn(**kwargs) for _ in range(NUM_REPEATS)]
sk_accs = [(y_test == p).mean() for p in sk_pred]

# Progress messages go to stderr; the final results go to stdout.
print('np.mean(rerf_accs)', np.mean(rerf_accs))
print('np.mean(sk_accs)', np.mean(sk_accs))
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement