Advertisement
Guest User

Untitled

a guest
Nov 30th, 2017
197
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 7.98 KB | None | 0 0
import warnings
from itertools import chain
from math import ceil, floor

import numpy as np
from sklearn.utils import indexable, check_random_state, safe_indexing
from sklearn.utils.validation import _num_samples, column_or_1d

from . import Spliter
  7.  
  8.  
  9. class Categorizer:
  10.  
  11. def __init__ (self, lfw_people):
  12. self.lfw_people=lfw_people
  13.  
  14. def categorize(self):
  15.  
  16. # introspect the images arrays to find the shapes (for plotting)
  17. n_samples, h, w = self.lfw_people.images.shape
  18. # for machine learning we use the 2 data directly (as relative pixel
  19. # positions info is ignored by this model)
  20. X = self.lfw_people.data
  21. n_features = X.shape[1]
  22.  
  23. # the label to predict is the id of the person
  24. y = self.lfw_people.target
  25. target_names = self.lfw_people.target_names
  26. n_classes = target_names.shape[0]
  27.  
  28. print("Total dataset size:")
  29. print("n_samples: %d" % n_samples)
  30. print("n_features: %d" % n_features)
  31. print("n_classes: %d" % n_classes)
  32.  
  33.  
  34. # #############################################################################
  35. # Split into a training set and a test set using a stratified k fold
  36.  
  37. # split into a training and testing set
  38. X_train, X_test, y_train, y_test = train_test_split(
  39. X, y, test_size=0.25, random_state=42)
  40. return X_train, X_test, y_train, y_test
  41.  
  42.  
  43. def train_test_split(*arrays, **options):
  44. """Split arrays or matrices into random train and test subsets
  45.  
  46. Quick utility that wraps input validation and
  47. ``next(ShuffleSplit().split(X, y))`` and application to input data
  48. into a single call for splitting (and optionally subsampling) data in a
  49. oneliner.
  50.  
  51. Read more in the :ref:`User Guide <cross_validation>`.
  52.  
  53. Parameters
  54. ----------
  55. *arrays : sequence of indexables with same length / shape[0]
  56. Allowed inputs are lists, numpy arrays, scipy-sparse
  57. matrices or pandas dataframes.
  58.  
  59. test_size : float, int, None, optional
  60. If float, should be between 0.0 and 1.0 and represent the proportion
  61. of the dataset to include in the test split. If int, represents the
  62. absolute number of test samples. If None, the value is set to the
  63. complement of the train size. By default, the value is set to 0.25.
  64. The default will change in version 0.21. It will remain 0.25 only
  65. if ``train_size`` is unspecified, otherwise it will complement
  66. the specified ``train_size``.
  67.  
  68. train_size : float, int, or None, default None
  69. If float, should be between 0.0 and 1.0 and represent the
  70. proportion of the dataset to include in the train split. If
  71. int, represents the absolute number of train samples. If None,
  72. the value is automatically set to the complement of the test size.
  73.  
  74. random_state : int, RandomState instance or None, optional (default=None)
  75. If int, random_state is the seed used by the random number generator;
  76. If RandomState instance, random_state is the random number generator;
  77. If None, the random number generator is the RandomState instance used
  78. by `np.random`.
  79.  
  80. shuffle : boolean, optional (default=True)
  81. Whether or not to shuffle the data before splitting. If shuffle=False
  82. then stratify must be None.
  83.  
  84. stratify : array-like or None (default is None)
  85. If not None, data is split in a stratified fashion, using this as
  86. the class labels.
  87.  
  88. Returns
  89. -------
  90. splitting : list, length=2 * len(arrays)
  91. List containing train-test split of inputs.
  92.  
  93. .. versionadded:: 0.16
  94. If the input is sparse, the output will be a
  95. ``scipy.sparse.csr_matrix``. Else, output type is the same as the
  96. input type.
  97.  
  98. Examples
  99. --------
  100. >>> import numpy as np
  101. >>> from sklearn.model_selection import train_test_split
  102. >>> X, y = np.arange(10).reshape((5, 2)), range(5)
  103. >>> X
  104. array([[0, 1],
  105. [2, 3],
  106. [4, 5],
  107. [6, 7],
  108. [8, 9]])
  109. >>> list(y)
  110. [0, 1, 2, 3, 4]
  111.  
  112. >>> X_train, X_test, y_train, y_test = train_test_split(
  113. ... X, y, test_size=0.33, random_state=42)
  114. ...
  115. >>> X_train
  116. array([[4, 5],
  117. [0, 1],
  118. [6, 7]])
  119. >>> y_train
  120. [2, 0, 3]
  121. >>> X_test
  122. array([[2, 3],
  123. [8, 9]])
  124. >>> y_test
  125. [1, 4]
  126.  
  127. >>> train_test_split(y, shuffle=False)
  128. [[0, 1, 2], [3, 4]]
  129.  
  130. """
  131. n_arrays = len(arrays)
  132. if n_arrays == 0:
  133. raise ValueError("At least one array required as input")
  134. test_size = options.pop('test_size', 'default')
  135. train_size = options.pop('train_size', None)
  136. random_state = options.pop('random_state', None)
  137. stratify = options.pop('stratify', None)
  138. shuffle = options.pop('shuffle', True)
  139.  
  140. if options:
  141. raise TypeError("Invalid parameters passed: %s" % str(options))
  142.  
  143. if test_size == 'default':
  144. test_size = None
  145. if train_size is not None:
  146. warnings.warn("From version 0.21, test_size will always "
  147. "complement train_size unless both "
  148. "are specified.",
  149. FutureWarning)
  150.  
  151. if test_size is None and train_size is None:
  152. test_size = 0.25
  153.  
  154. arrays = indexable(*arrays)
  155.  
  156. if shuffle is False:
  157. if stratify is not None:
  158. raise ValueError(
  159. "Stratified train/test split is not implemented for "
  160. "shuffle=False")
  161.  
  162. n_samples = _num_samples(arrays[0])
  163. n_train, n_test = _validate_shuffle_split(n_samples, test_size,
  164. train_size)
  165.  
  166. train = np.arange(n_train)
  167. test = np.arange(n_train, n_train + n_test)
  168.  
  169. else:
  170. spliter = Spliter()
  171. if stratify is not None:
  172. CVClass = spliter.StratifiedShuffleSplit
  173. else:
  174. CVClass = spliter.ShuffleSplit
  175.  
  176. cv = CVClass(test_size=test_size,
  177. train_size=train_size,
  178. random_state=random_state)
  179.  
  180. train, test = next(cv.split(X=arrays[0], y=stratify))
  181.  
  182. return list(Spliter.chain.from_iterable((safe_indexing(a, train),
  183. safe_indexing(a, test)) for a in arrays))
  184.  
  185.  
  186. def _validate_shuffle_split(n_samples, test_size, train_size):
  187. """
  188. Validation helper to check if the test/test sizes are meaningful wrt to the
  189. size of the data (n_samples)
  190. """
  191. if (test_size is not None and
  192. np.asarray(test_size).dtype.kind == 'i' and
  193. test_size >= n_samples):
  194. raise ValueError('test_size=%d should be smaller than the number of '
  195. 'samples %d' % (test_size, n_samples))
  196.  
  197. if (train_size is not None and
  198. np.asarray(train_size).dtype.kind == 'i' and
  199. train_size >= n_samples):
  200. raise ValueError("train_size=%d should be smaller than the number of"
  201. " samples %d" % (train_size, n_samples))
  202.  
  203. if test_size == "default":
  204. test_size = 0.1
  205.  
  206. if np.asarray(test_size).dtype.kind == 'f':
  207. n_test = ceil(test_size * n_samples)
  208. elif np.asarray(test_size).dtype.kind == 'i':
  209. n_test = float(test_size)
  210.  
  211. if train_size is None:
  212. n_train = n_samples - n_test
  213. elif np.asarray(train_size).dtype.kind == 'f':
  214. n_train = floor(train_size * n_samples)
  215. else:
  216. n_train = float(train_size)
  217.  
  218. if test_size is None:
  219. n_test = n_samples - n_train
  220.  
  221. if n_train + n_test > n_samples:
  222. raise ValueError('The sum of train_size and test_size = %d, '
  223. 'should be smaller than the number of '
  224. 'samples %d. Reduce test_size and/or '
  225. 'train_size.' % (n_train + n_test, n_samples))
  226.  
  227. return int(n_train), int(n_test)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement