Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.utils import column_or_1d
from sklearn.utils.validation import check_array
from sklearn.utils.validation import check_is_fitted
from sklearn.utils.validation import _num_samples
from sklearn.utils.multiclass import unique_labels
from sklearn.utils.multiclass import type_of_target
def _encode_check_unknown(values, uniques, return_mask=False):
    """Find the values that are not among the allowed ``uniques``.

    Object-dtype input is handled with Python sets; every other dtype is
    handled with NumPy set routines.

    Parameters
    ----------
    values : array
        Values to check for unknowns.
    uniques : array
        Allowed unique values.
    return_mask : bool, default False
        If True, also return a boolean mask of the same shape as `values`
        that is True wherever the value is one of `uniques`.

    Returns
    -------
    diff : list
        The unique values present in `values` and not in `uniques` (the
        unknown values).
    valid_mask : boolean array
        Additionally returned if ``return_mask=True``.
    """
    if values.dtype == object:
        uniques_set = set(uniques)
        diff = list(set(values) - uniques_set)
        if not return_mask:
            return diff
        if diff:
            valid_mask = np.array([val in uniques_set for val in values])
        else:
            # nothing unknown: skip the per-element membership test
            valid_mask = np.ones(len(values), dtype=bool)
        return diff, valid_mask

    unique_values = np.unique(values)
    diff = list(np.setdiff1d(unique_values, uniques, assume_unique=True))
    if not return_mask:
        return diff
    if diff:
        # np.isin replaces the deprecated np.in1d and, unlike it, keeps the
        # shape of `values` instead of flattening the result.
        valid_mask = np.isin(values, uniques)
    else:
        # nothing unknown: every position is valid, shaped like `values`
        valid_mask = np.ones(values.shape, dtype=bool)
    return diff, valid_mask
def _encode_numpy(values, uniques=None, encode=False, check_unknown=True):
    """NumPy-based implementation used by ``_encode``.

    Without `uniques`, the categories are taken from ``np.unique(values)``
    (which sorts them). With `uniques`, they are used as given and codes are
    looked up with ``np.searchsorted``, so `uniques` must already be sorted.

    Returns `uniques` alone when ``encode`` is False, otherwise the pair
    ``(uniques, encoded)``. Raises ``ValueError`` when ``check_unknown`` is
    True and `values` holds entries missing from the supplied `uniques`.
    """
    if uniques is None:
        if not encode:
            # np.unique returns the categories in sorted order
            return np.unique(values)
        categories, codes = np.unique(values, return_inverse=True)
        return categories, codes

    if not encode:
        return uniques

    if check_unknown:
        unseen = _encode_check_unknown(values, uniques)
        if unseen:
            raise ValueError("y contains previously unseen labels: %s"
                             % str(unseen))
    # position of each value inside the (sorted) categories
    codes = np.searchsorted(uniques, values)
    return uniques, codes
def _encode_python(values, uniques=None, encode=False):
    """Pure-Python implementation used by ``_encode``.

    Without `uniques`, the categories are the sorted distinct entries of
    `values`, stored in an array of ``values.dtype``. When ``encode`` is
    True, each value is replaced by its position in `uniques`; a value that
    is not in `uniques` is encoded as ``-1``.

    Returns `uniques` alone when ``encode`` is False, otherwise the pair
    ``(uniques, encoded)``.
    """
    if uniques is None:
        uniques = np.array(sorted(set(values)), dtype=values.dtype)
    if not encode:
        return uniques

    positions = {label: index for index, label in enumerate(uniques)}
    # labels missing from the lookup table fall back to the code -1
    codes = np.array([positions.get(label, -1) for label in values])
    return uniques, codes
def _encode(values, uniques=None, encode=False, check_unknown=True):
    """Factorize (find the uniques of) `values` and optionally encode them.

    Object-dtype arrays are handled by ``_encode_python``; every other dtype
    is handled by ``_encode_numpy``.

    The NumPy path requires `uniques` to be sorted. This is assumed, not
    checked, so the caller must guarantee it for all non-object values.

    Parameters
    ----------
    values : array
        Values to factorize or encode.
    uniques : array, optional
        If passed, uniques are not determined from `values` (for instance
        because the categories were user-specified or were already
        determined in fit).
    encode : bool, default False
        If True, also encode the values into integer codes based on
        `uniques`.
    check_unknown : bool, default True
        Forwarded to ``_encode_numpy`` only: if True, values missing from
        `uniques` raise an error there. The object-dtype path does not
        receive this flag.

    Returns
    -------
    uniques
        If ``encode=False``. The unique values are sorted if the `uniques`
        parameter was None (and thus inferred from the data).
    (uniques, encoded)
        If ``encode=True``.

    Raises
    ------
    TypeError
        If the object-dtype path raises ``TypeError``; it is re-raised with
        the message "argument must be a string or number".
    """
    if values.dtype != object:
        return _encode_numpy(values, uniques, encode,
                             check_unknown=check_unknown)

    try:
        return _encode_python(values, uniques, encode)
    except TypeError:
        raise TypeError("argument must be a string or number")
class CustomLabelEncoder(LabelEncoder):
    """Encode labels with value between 0 and n_classes-1.

    Subclass of ``LabelEncoder`` whose fitting and encoding go through this
    module's ``_encode`` helper. When the labels reach ``_encode`` as an
    object-dtype array, labels absent from ``classes_`` are encoded as
    ``-1``; for any other dtype such labels raise ``ValueError``.
    ``inverse_transform`` accepts only codes in ``0 .. n_classes - 1``, so a
    ``-1`` code is rejected there.

    Attributes
    ----------
    classes_ : array of shape (n_class,)
        Holds the label for each class.

    Examples
    --------
    >>> le = CustomLabelEncoder().fit([1, 2, 2, 6])
    >>> le.classes_
    array([1, 2, 6])
    >>> le.transform([1, 1, 2, 6])
    array([0, 0, 1, 2]...)
    >>> le.inverse_transform([0, 0, 1, 2])
    array([1, 1, 2, 6])

    Non-numerical labels work too, as long as they are hashable and
    comparable.

    >>> le = CustomLabelEncoder().fit(["paris", "paris", "tokyo", "amsterdam"])
    >>> list(le.classes_)
    ['amsterdam', 'paris', 'tokyo']
    >>> le.transform(["tokyo", "tokyo", "paris"])
    array([2, 2, 1]...)
    >>> list(le.inverse_transform([2, 2, 1]))
    ['tokyo', 'tokyo', 'paris']

    See also
    --------
    sklearn.preprocessing.OrdinalEncoder : encode categorical features
        using an ordinal encoding scheme.
    """

    def fit(self, y):
        """Learn the set of classes from `y`.

        Parameters
        ----------
        y : array-like of shape (n_samples,)
            Target values.

        Returns
        -------
        self : returns an instance of self.
        """
        labels = column_or_1d(y, warn=True)
        self.classes_ = _encode(labels)
        return self

    def fit_transform(self, y):
        """Learn the classes from `y` and return its encoded labels.

        Parameters
        ----------
        y : array-like of shape [n_samples]
            Target values.

        Returns
        -------
        y : array-like of shape [n_samples]
        """
        labels = column_or_1d(y, warn=True)
        self.classes_, codes = _encode(labels, encode=True)
        return codes

    def transform(self, y):
        """Encode `y` using the classes learned during fit.

        Parameters
        ----------
        y : array-like of shape [n_samples]
            Target values.

        Returns
        -------
        y : array-like of shape [n_samples]
        """
        check_is_fitted(self, 'classes_')
        labels = column_or_1d(y, warn=True)
        if _num_samples(labels) == 0:
            # an empty input encodes to an empty array
            return np.array([])

        _, codes = _encode(labels, uniques=self.classes_, encode=True)
        return codes

    def inverse_transform(self, y):
        """Map integer codes back to the original labels.

        Parameters
        ----------
        y : numpy array of shape [n_samples]
            Target values.

        Returns
        -------
        y : numpy array of shape [n_samples]

        Raises
        ------
        ValueError
            If `y` contains a code outside ``0 .. len(classes_) - 1``.
        """
        check_is_fitted(self, 'classes_')
        codes = column_or_1d(y, warn=True)
        if _num_samples(codes) == 0:
            # an empty input decodes to an empty array
            return np.array([])

        valid_codes = np.arange(len(self.classes_))
        unseen = np.setdiff1d(codes, valid_codes)
        if len(unseen):
            raise ValueError(
                "y contains previously unseen labels: %s" % str(unseen))
        codes = np.asarray(codes)
        return self.classes_[codes]

    def _more_tags(self):
        """Return the estimator tags dict declaring 1-d label input."""
        return {'X_types': ['1dlabels']}
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement