Advertisement
Guest User

Untitled

a guest
Aug 15th, 2019
159
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 7.97 KB | None | 0 0
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.utils import column_or_1d
from sklearn.utils.multiclass import type_of_target
from sklearn.utils.multiclass import unique_labels
from sklearn.utils.validation import _num_samples
from sklearn.utils.validation import check_array
from sklearn.utils.validation import check_is_fitted
  9.  
  10.  
  11. def _encode_check_unknown(values, uniques, return_mask=False):
  12. """
  13. Helper function to check for unknowns in values to be encoded.
  14. Uses pure python method for object dtype, and numpy method for
  15. all other dtypes.
  16. Parameters
  17. ----------
  18. values : array
  19. Values to check for unknowns.
  20. uniques : array
  21. Allowed uniques values.
  22. return_mask : bool, default False
  23. If True, return a mask of the same shape as `values` indicating
  24. the valid values.
  25. Returns
  26. -------
  27. diff : list
  28. The unique values present in `values` and not in `uniques` (the
  29. unknown values).
  30. valid_mask : boolean array
  31. Additionally returned if ``return_mask=True``.
  32. """
  33. if values.dtype == object:
  34. uniques_set = set(uniques)
  35. diff = list(set(values) - uniques_set)
  36. if return_mask:
  37. if diff:
  38. valid_mask = np.array([val in uniques_set for val in values])
  39. else:
  40. valid_mask = np.ones(len(values), dtype=bool)
  41. return diff, valid_mask
  42. else:
  43. return diff
  44. else:
  45. unique_values = np.unique(values)
  46. diff = list(np.setdiff1d(unique_values, uniques, assume_unique=True))
  47. if return_mask:
  48. if diff:
  49. valid_mask = np.in1d(values, uniques)
  50. else:
  51. valid_mask = np.ones(len(values), dtype=bool)
  52. return diff, valid_mask
  53. else:
  54. return diff
  55.  
  56.  
  57.  
  58. def _encode_numpy(values, uniques=None, encode=False, check_unknown=True):
  59. # only used in _encode below, see docstring there for details
  60. if uniques is None:
  61. if encode:
  62. uniques, encoded = np.unique(values, return_inverse=True)
  63. return uniques, encoded
  64. else:
  65. # unique sorts
  66. return np.unique(values)
  67. if encode:
  68. if check_unknown:
  69. diff = _encode_check_unknown(values, uniques)
  70. if diff:
  71. raise ValueError("y contains previously unseen labels: %s"
  72. % str(diff))
  73. encoded = np.searchsorted(uniques, values)
  74. return uniques, encoded
  75. else:
  76. return uniques
  77.  
  78.  
  79. def _encode_python(values, uniques=None, encode=False):
  80. # only used in _encode below, see docstring there for details
  81. if uniques is None:
  82. uniques = sorted(set(values))
  83. uniques = np.array(uniques, dtype=values.dtype)
  84. if encode:
  85. table = {val: i for i, val in enumerate(uniques)}
  86. encoded = np.array([table.get(v, -1) for v in values])
  87. return uniques, encoded
  88. else:
  89. return uniques
  90.  
  91.  
  92. def _encode(values, uniques=None, encode=False, check_unknown=True):
  93. """Helper function to factorize (find uniques) and encode values.
  94. Uses pure python method for object dtype, and numpy method for
  95. all other dtypes.
  96. The numpy method has the limitation that the `uniques` need to
  97. be sorted. Importantly, this is not checked but assumed to already be
  98. the case. The calling method needs to ensure this for all non-object
  99. values.
  100. Parameters
  101. ----------
  102. values : array
  103. Values to factorize or encode.
  104. uniques : array, optional
  105. If passed, uniques are not determined from passed values (this
  106. can be because the user specified categories, or because they
  107. already have been determined in fit).
  108. encode : bool, default False
  109. If True, also encode the values into integer codes based on `uniques`.
  110. check_unknown : bool, default True
  111. If True, check for values in ``values`` that are not in ``unique``
  112. and raise an error. This is ignored for object dtype, and treated as
  113. True in this case. This parameter is useful for
  114. _BaseEncoder._transform() to avoid calling _encode_check_unknown()
  115. twice.
  116. Returns
  117. -------
  118. uniques
  119. If ``encode=False``. The unique values are sorted if the `uniques`
  120. parameter was None (and thus inferred from the data).
  121. (uniques, encoded)
  122. If ``encode=True``.
  123. """
  124. if values.dtype == object:
  125. try:
  126. res = _encode_python(values, uniques, encode)
  127. except TypeError:
  128. raise TypeError("argument must be a string or number")
  129. return res
  130. else:
  131. return _encode_numpy(values, uniques, encode,
  132. check_unknown=check_unknown)
  133.  
  134.  
  135. class CustomLabelEncoder(LabelEncoder):
  136. """Encode labels with value between 0 and n_classes-1.
  137. Read more in the :ref:`User Guide <preprocessing_targets>`.
  138. Attributes
  139. ----------
  140. classes_ : array of shape (n_class,)
  141. Holds the label for each class.
  142. Examples
  143. --------
  144. `LabelEncoder` can be used to normalize labels.
  145. >>> from sklearn import preprocessing
  146. >>> le = preprocessing.LabelEncoder()
  147. >>> le.fit([1, 2, 2, 6])
  148. LabelEncoder()
  149. >>> le.classes_
  150. array([1, 2, 6])
  151. >>> le.transform([1, 1, 2, 6])
  152. array([0, 0, 1, 2]...)
  153. >>> le.inverse_transform([0, 0, 1, 2])
  154. array([1, 1, 2, 6])
  155. It can also be used to transform non-numerical labels (as long as they are
  156. hashable and comparable) to numerical labels.
  157. >>> le = preprocessing.LabelEncoder()
  158. >>> le.fit(["paris", "paris", "tokyo", "amsterdam"])
  159. LabelEncoder()
  160. >>> list(le.classes_)
  161. ['amsterdam', 'paris', 'tokyo']
  162. >>> le.transform(["tokyo", "tokyo", "paris"])
  163. array([2, 2, 1]...)
  164. >>> list(le.inverse_transform([2, 2, 1]))
  165. ['tokyo', 'tokyo', 'paris']
  166. See also
  167. --------
  168. sklearn.preprocessing.OrdinalEncoder : encode categorical features
  169. using an ordinal encoding scheme.
  170. """
  171.  
  172. def fit(self, y):
  173. """Fit label encoder
  174. Parameters
  175. ----------
  176. y : array-like of shape (n_samples,)
  177. Target values.
  178. Returns
  179. -------
  180. self : returns an instance of self.
  181. """
  182. y = column_or_1d(y, warn=True)
  183. self.classes_ = _encode(y)
  184. return self
  185.  
  186. def fit_transform(self, y):
  187. """Fit label encoder and return encoded labels
  188. Parameters
  189. ----------
  190. y : array-like of shape [n_samples]
  191. Target values.
  192. Returns
  193. -------
  194. y : array-like of shape [n_samples]
  195. """
  196. y = column_or_1d(y, warn=True)
  197. self.classes_, y = _encode(y, encode=True)
  198. return y
  199.  
  200. def transform(self, y):
  201. """Transform labels to normalized encoding.
  202. Parameters
  203. ----------
  204. y : array-like of shape [n_samples]
  205. Target values.
  206. Returns
  207. -------
  208. y : array-like of shape [n_samples]
  209. """
  210. check_is_fitted(self, 'classes_')
  211. y = column_or_1d(y, warn=True)
  212. # transform of empty array is empty array
  213. if _num_samples(y) == 0:
  214. return np.array([])
  215.  
  216. _, y = _encode(y, uniques=self.classes_, encode=True)
  217. return y
  218.  
  219. def inverse_transform(self, y):
  220. """Transform labels back to original encoding.
  221. Parameters
  222. ----------
  223. y : numpy array of shape [n_samples]
  224. Target values.
  225. Returns
  226. -------
  227. y : numpy array of shape [n_samples]
  228. """
  229. check_is_fitted(self, 'classes_')
  230. y = column_or_1d(y, warn=True)
  231. # inverse transform of empty array is empty array
  232. if _num_samples(y) == 0:
  233. return np.array([])
  234.  
  235. diff = np.setdiff1d(y, np.arange(len(self.classes_)))
  236. if len(diff):
  237. raise ValueError(
  238. "y contains previously unseen labels: %s" % str(diff))
  239. y = np.asarray(y)
  240. return self.classes_[y]
  241.  
  242. def _more_tags(self):
  243. return {'X_types': ['1dlabels']}
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement