# Untitled

"""
An example of adapting a non-predicting estimator (i.e.,
one that doesn't expose a public ``predict`` method, but
only a ``fit_predict`` one), such as ``LocalOutlierFactor``,
to make predictions on "unseen" data.

One could argue whether or not this particular approach is
entirely legitimate, since not exposing a ``predict`` method
will, in most sensible cases, have been a deliberate choice
driven by design and semantic constraints.

Nevertheless, for the adventurous crowd out there, I've
provided a rudimentary means of making predictions on
"unseen" data via sub-classing the estimator in question.
"""

import numpy as np
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.neighbors import LocalOutlierFactor


# Seed passed to the NumPy random number generators in this script.
SEED = 42

class LOFPredictor(LocalOutlierFactor):
    """``LocalOutlierFactor`` subclass that adds a public ``predict`` method.

    The method simply forwards to the private ``_predict`` of the parent
    class, which lets tools that call ``estimator.predict`` (such as the
    ``GridSearchCV`` used in this script) work with this estimator.

    NOTE(review): this relies on a private scikit-learn API; newer releases
    appear to offer ``novelty=True`` for predicting on new data -- confirm
    against the installed version before reusing this class.
    """

    def predict(self, X=None):
        """Return whatever the inherited ``_predict`` yields for ``X``.

        Parameters
        ----------
        X : optional
            Forwarded unchanged to ``_predict``; defaults to ``None``.
            Presumably an array of samples to label -- verify against the
            parent class.

        Returns
        -------
        The value returned by ``self._predict(X)``.
        """
        predictions = self._predict(X)
        return predictions

# Dedicated RandomState seeded with SEED.
# NOTE(review): rng is not used again in the visible code; the data
# generation below seeds and draws from the global np.random state instead.
rng = np.random.RandomState(SEED)

# Example settings
n_samples = 200
true_outliers_fraction = 0.25
offset = 2

# Square evaluation grid spanning [-7, 7] on both axes.  ``np.linspace``
# requires an integer number of points, so floor division is used here:
# ``n_samples / 2`` is a float on Python 3 and NumPy rejects it.
# NOTE(review): xx / yy are not referenced again in the visible code.
xx, yy = np.meshgrid(np.linspace(-7, 7, n_samples // 2),
                     np.linspace(-7, 7, n_samples // 2))
n_outliers = int(true_outliers_fraction * n_samples)
n_inliers = n_samples - n_outliers

# Ground-truth labels: 1 for the leading inliers, -1 for the trailing
# outliers.  Slicing from ``n_inliers`` (instead of ``-n_outliers``) keeps
# the labels correct when ``n_outliers`` is 0, where ``y_true[-0:]`` would
# select -- and overwrite -- every element.
y_true = np.ones(n_samples, dtype=int)
y_true[n_inliers:] = -1

# Data generation.  The global NumPy RNG is seeded first so the draws below
# are reproducible; the order of the three draws is significant.
np.random.seed(SEED)
# Two inlier clusters drawn from a scaled standard normal, shifted by
# -offset and +offset respectively.
X1 = np.random.randn(n_inliers // 2, 2) * 0.3 - offset
X2 = np.random.randn(n_inliers // 2, 2) * 0.3 + offset
# Stack the inliers first, then append uniformly distributed outliers, so
# the outliers occupy the trailing rows of X.
X = np.concatenate(
    [X1, X2, np.random.uniform(low=-6, high=6, size=(n_outliers, 2))],
    axis=0,
)

# NOTE(review): outliers_fraction is not referenced again in the visible code.
outliers_fraction = .25
# Shuffled 3-fold split; seeded with the shared SEED constant (rather than a
# repeated literal) so all seeding in the script is controlled in one place.
kfold = KFold(n_splits=3, shuffle=True, random_state=SEED)

# Candidate hyper-parameters: every (n_neighbors, contamination) pair from
# the two tuples below is evaluated.
param_grid = [
    {
        'n_neighbors': (25, 29, 35),
        'contamination': (.25, .27, .3),
    },
]

# Exhaustive search scored by accuracy of ``LOFPredictor.predict`` against
# y_true on each held-out fold; n_jobs=-1 lets the search run in parallel.
clf = GridSearchCV(LOFPredictor(), param_grid=param_grid, scoring="accuracy",
                   cv=kfold, n_jobs=-1)
clf.fit(X, y_true)

print("Best params: {}".format(clf.best_params_))