Advertisement
Guest User

Untitled

a guest
Jan 22nd, 2017
123
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 13.88 KB | None | 0 0
  1. # (C) Kyle Kastner, June 2014
  2. # License: BSD 3 clause
  3.  
  4. from sklearn.base import BaseEstimator, TransformerMixin
  5. from sklearn.utils import gen_batches
  6. from scipy.linalg import eigh
  7. from scipy.linalg import svd
  8. import numpy as np
  9.  
  10. # From sklearn master
  11. def svd_flip(u, v, u_based_decision=True):
  12. """Sign correction to ensure deterministic output from SVD.
  13.  
  14. Adjusts the columns of u and the rows of v such that the loadings in the
  15. columns in u that are largest in absolute value are always positive.
  16.  
  17. Parameters
  18. ----------
  19. u, v : ndarray
  20. u and v are the output of `linalg.svd` or
  21. `sklearn.utils.extmath.randomized_svd`, with matching inner dimensions
  22. so one can compute `np.dot(u * s, v)`.
  23.  
  24. u_based_decision : boolean, (default=True)
  25. If True, use the columns of u as the basis for sign flipping. Otherwise,
  26. use the rows of v. The choice of which variable to base the decision on
  27. is generally algorithm dependent.
  28.  
  29.  
  30. Returns
  31. -------
  32. u_adjusted, v_adjusted : arrays with the same dimensions as the input.
  33.  
  34. """
  35. if u_based_decision:
  36. # columns of u, rows of v
  37. max_abs_cols = np.argmax(np.abs(u), axis=0)
  38. signs = np.sign(u[max_abs_cols, xrange(u.shape[1])])
  39. u *= signs
  40. v *= signs[:, np.newaxis]
  41. else:
  42. # rows of v, columns of u
  43. max_abs_rows = np.argmax(np.abs(v), axis=1)
  44. signs = np.sign(v[xrange(v.shape[0]), max_abs_rows])
  45. u *= signs
  46. v *= signs[:, np.newaxis]
  47. return u, v
  48.  
  49.  
  50. def _batch_mean_variance_update(X, old_mean, old_variance, old_sample_count):
  51. """Calculate an average mean update and a Youngs and Cramer variance update.
  52.  
  53. From the paper "Algorithms for computing the sample variance: analysis and
  54. recommendations", by Chan, Golub, and LeVeque.
  55.  
  56. Parameters
  57. ----------
  58. X : array-like, shape (n_samples, n_features)
  59. Data to use for variance update
  60.  
  61. old_mean : array-like, shape: (n_features,)
  62.  
  63. old_variance : array-like, shape: (n_features,)
  64.  
  65. old_sample_count : int
  66.  
  67. Returns
  68. -------
  69. updated_mean : array, shape (n_features,)
  70.  
  71. updated_variance : array, shape (n_features,)
  72.  
  73. updated_sample_count : int
  74.  
  75. References
  76. ----------
  77. T. Chan, G. Golub, R. LeVeque. Algorithms for computing the sample variance:
  78. recommendations, The American Statistician, Vol. 37, No. 3, pp. 242-247
  79.  
  80. """
  81. new_sum = X.sum(axis=0)
  82. new_variance = X.var(axis=0) * X.shape[0]
  83. old_sum = old_mean * old_sample_count
  84. n_samples = X.shape[0]
  85. updated_sample_count = old_sample_count + n_samples
  86. partial_variance = old_sample_count / (n_samples * updated_sample_count) * (
  87. n_samples / old_sample_count * old_sum - new_sum) ** 2
  88. unnormalized_variance = old_variance * old_sample_count + new_variance + \
  89. partial_variance
  90. return ((old_sum + new_sum) / updated_sample_count,
  91. unnormalized_variance / updated_sample_count,
  92. updated_sample_count)
  93.  
  94.  
  95. class _CovZCA(BaseEstimator, TransformerMixin):
  96. def __init__(self, n_components=None, bias=.1, copy=True):
  97. self.n_components = n_components
  98. self.bias = bias
  99. self.copy = copy
  100.  
  101. def fit(self, X, y=None):
  102. if self.copy:
  103. X = np.array(X, copy=self.copy)
  104. n_samples, n_features = X.shape
  105. self.mean_ = np.mean(X, axis=0)
  106. X -= self.mean_
  107. U, S, VT = svd(np.dot(X.T, X) / n_samples, full_matrices=False)
  108. components = np.dot(VT.T * np.sqrt(1.0 / (S + self.bias)), VT)
  109. self.components_ = components[:self.n_components]
  110. return self
  111.  
  112. def transform(self, X):
  113. if self.copy:
  114. X = np.array(X, copy=self.copy)
  115. X -= self.mean_
  116. X_transformed = np.dot(X, self.components_.T)
  117. return X_transformed
  118.  
  119.  
  120. class ZCA(BaseEstimator, TransformerMixin):
  121. """
  122. Identical to CovZCA up to scaling due to lack of division by n_samples
  123. S ** 2 / n_samples should correct this but components_ come out different
  124. though transformed examples are identical.
  125. """
  126. def __init__(self, n_components=None, bias=.1, copy=True):
  127. self.n_components = n_components
  128. self.bias = bias
  129. self.copy = copy
  130.  
  131. def fit(self, X, y=None):
  132. if self.copy:
  133. X = np.array(X, copy=self.copy)
  134. n_samples, n_features = X.shape
  135. self.mean_ = np.mean(X, axis=0)
  136. X -= self.mean_
  137. U, S, VT = svd(X, full_matrices=False)
  138. components = np.dot(VT.T * np.sqrt(1.0 / (S ** 2 + self.bias)), VT)
  139. self.components_ = components[:self.n_components]
  140. return self
  141.  
  142. def transform(self, X):
  143. if self.copy:
  144. X = np.array(X, copy=self.copy)
  145. X = np.copy(X)
  146. X -= self.mean_
  147. X_transformed = np.dot(X, self.components_.T)
  148. return X_transformed
  149.  
  150. import scipy
  151. X = scipy.misc.lena()
  152. zca = ZCA()
  153. czca = _CovZCA()
  154. X_zca = zca.fit_transform(X)
  155. X_czca = czca.fit_transform(X)
  156. from IPython import embed; embed()
  157. raise ValueError()
  158.  
  159.  
  160. class IncrementalCovZCA(BaseEstimator, TransformerMixin):
  161. def __init__(self, n_components=None, batch_size=None, bias=.1,
  162. scale_by=1., copy=True):
  163. self.n_components = n_components
  164. self.batch_size = batch_size
  165. self.bias = bias
  166. self.scale_by = scale_by
  167. self.copy = copy
  168. self.scale_by = float(scale_by)
  169. self.mean_ = None
  170. self.covar_ = None
  171. self.n_samples_seen_ = 0.
  172.  
  173. def fit(self, X, y=None):
  174. self.mean_ = None
  175. self.covar_ = None
  176. self.n_samples_seen_ = 0.
  177. n_samples, n_features = X.shape
  178. if self.batch_size is None:
  179. self.batch_size_ = 5 * n_features
  180. else:
  181. self.batch_size_ = self.batch_size
  182. for batch in gen_batches(n_samples, self.batch_size_):
  183. self.partial_fit(X[batch])
  184. return self
  185.  
  186. def partial_fit(self, X):
  187. self.components_ = None
  188. if self.copy:
  189. X = np.array(X, copy=self.copy)
  190. X = np.copy(X)
  191. X /= self.scale_by
  192. n_samples, n_features = X.shape
  193. batch_mean = np.mean(X, axis=0)
  194. # Doing this without subtracting mean results in numerical instability
  195. # will have to play some games to work around this
  196. if self.mean_ is None:
  197. X -= batch_mean
  198. batch_covar = np.dot(X.T, X)
  199. self.mean_ = batch_mean
  200. self.covar_ = batch_covar
  201. self.n_samples_seen_ += float(n_samples)
  202. else:
  203. prev_mean = self.mean_
  204. prev_sample_count = self.n_samples_seen_
  205. prev_scale = self.n_samples_seen_ / (self.n_samples_seen_
  206. + n_samples)
  207. update_scale = n_samples / (self.n_samples_seen_ + n_samples)
  208. self.mean_ = self.mean_ * prev_scale + batch_mean * update_scale
  209.  
  210. X -= batch_mean
  211. # All of this correction is to minimize numerical instability in
  212. # the dot product
  213. batch_covar = np.dot(X.T, X)
  214. batch_offset = (self.mean_ - batch_mean)
  215. batch_adjustment = np.dot(batch_offset[None].T, batch_offset[None])
  216. batch_covar += batch_adjustment * n_samples
  217.  
  218. mean_offset = (self.mean_ - prev_mean)
  219. mean_adjustment = np.dot(mean_offset[None].T, mean_offset[None])
  220. self.covar_ += mean_adjustment * prev_sample_count
  221.  
  222. self.covar_ += batch_covar
  223. self.n_samples_seen_ += n_samples
  224.  
  225. def transform(self, X):
  226. if self.copy:
  227. X = np.array(X, copy=self.copy)
  228. X = np.copy(X)
  229. if self.components_ is None:
  230. U, S, VT = svd(self.covar_ / self.n_samples_seen_,
  231. full_matrices=False)
  232. components = np.dot(VT.T * np.sqrt(1.0 / (S + self.bias)), VT)
  233. self.components_ = components[:self.n_components]
  234. X /= self.scale_by
  235. X -= self.mean_
  236. X_transformed = np.dot(X, self.components_.T)
  237. return X_transformed
  238.  
  239.  
  240. class IncrementalZCA(BaseEstimator, TransformerMixin):
  241. def __init__(self, n_components=None, batch_size=None, bias=.1,
  242. scale_by=1., copy=True):
  243. self.n_components = n_components
  244. self.batch_size = batch_size
  245. self.bias = bias
  246. self.scale_by = scale_by
  247. self.copy = copy
  248. self.scale_by = float(scale_by)
  249. self.n_samples_seen_ = 0.
  250. self.mean_ = None
  251. self.var_ = None
  252. self.components_ = None
  253.  
  254. def fit(self, X, y=None):
  255. self.n_samples_seen_ = 0.
  256. self.mean_ = None
  257. self.var_ = None
  258. self.components_ = None
  259. n_samples, n_features = X.shape
  260. if self.batch_size is None:
  261. self.batch_size_ = 5 * n_features
  262. else:
  263. self.batch_size_ = self.batch_size
  264. for batch in gen_batches(n_samples, self.batch_size_):
  265. self.partial_fit(X[batch])
  266. return self
  267.  
  268. def partial_fit(self, X):
  269. if self.copy:
  270. X = np.array(X, copy=self.copy)
  271. X = np.copy(X)
  272. n_samples, n_features = X.shape
  273. self.n_components_ = self.n_components
  274. X /= self.scale_by
  275. if self.components_ is None:
  276. # This is the first pass through partial_fit
  277. self.n_samples_seen_ = 0.
  278. col_var = X.var(axis=0)
  279. col_mean = X.mean(axis=0)
  280. X -= col_mean
  281. U, S, V = svd(X, full_matrices=False)
  282. U, V = svd_flip(U, V, u_based_decision=False)
  283. else:
  284. col_batch_mean = X.mean(axis=0)
  285. col_mean, col_var, n_total_samples = _batch_mean_variance_update(
  286. X, self.mean_, self.var_, self.n_samples_seen_)
  287. X -= col_batch_mean
  288. # Build matrix of combined previous basis and new data
  289. correction = np.sqrt((self.n_samples_seen_ * n_samples)
  290. / n_total_samples)
  291. mean_correction = correction * (self.mean_ - col_batch_mean)
  292. X_combined = np.vstack((self.singular_values_.reshape((-1, 1)) *
  293. self.components_,
  294. X,
  295. mean_correction))
  296. U, S, V = svd(X_combined, full_matrices=False)
  297. U, V = svd_flip(U, V, u_based_decision=False)
  298.  
  299. self.n_samples_seen_ += n_samples
  300. self.components_ = V[:self.n_components_]
  301. self.singular_values_ = S[:self.n_components_]
  302. self.mean_ = col_mean
  303. self.var_ = col_var
  304. self.zca_components_ = np.dot(self.components_.T *
  305. np.sqrt(1.0 / (self.singular_values_ ** 2 + self.bias)), self.components_)
  306.  
  307. def transform(self, X):
  308. if self.copy:
  309. X = np.array(X, copy=self.copy)
  310. X = np.copy(X)
  311. X /= self.scale_by
  312. X -= self.mean_
  313. X_transformed = np.dot(X, self.zca_components_.T)
  314. return X_transformed
  315.  
  316.  
  317. if __name__ == "__main__":
  318. from numpy.testing import assert_almost_equal
  319. import matplotlib.pyplot as plt
  320. from scipy.misc import lena
  321. # scale_by is necessary otherwise float32 results are numerically unstable
  322. # scale_by is still not enough to totally eliminate the error in float32
  323. # for many, many iterations but it is very close
  324. X = lena().astype('float32')
  325. X_orig = np.copy(X)
  326.  
  327. # Check that covariance ZCA and data ZCA produce same results
  328. czca = CovZCA()
  329. zca = ZCA()
  330. X_czca = czca.fit_transform(X)
  331. X_zca = zca.fit_transform(X)
  332. assert_almost_equal(abs(zca.components_), abs(czca.components_), 3)
  333. raise ValueError()
  334.  
  335.  
  336. random_state = np.random.RandomState(1999)
  337. X = random_state.rand(2000, 512).astype('float64') * 255.
  338. X_orig = np.copy(X)
  339. scale_by = 1.
  340. from sklearn.decomposition import PCA, IncrementalPCA
  341. zca = ZCA(n_components=512, scale_by=scale_by)
  342. pca = PCA(n_components=512)
  343. izca = IncrementalZCA(n_components=512, batch_size=1000)
  344. ipca = IncrementalPCA(n_components=512, batch_size=1000)
  345. X_pca = pca.fit_transform(X)
  346. X_zca = zca.fit_transform(X)
  347. X_izca = izca.fit_transform(X)
  348. X_ipca = ipca.fit_transform(X)
  349. assert_almost_equal(abs(pca.components_), abs(ipca.components_), 3)
  350. from IPython import embed; embed()
  351. assert_almost_equal(abs(zca.components_), abs(izca.zca_components_), 3)
  352.  
  353. for batch_size in [512, 128]:
  354. print("Testing batch size %i" % batch_size)
  355. izca = IncrementalZCA(batch_size=batch_size, scale_by=scale_by)
  356. # Test that partial fit over subset has the same mean!
  357. zca.fit(X[:batch_size])
  358. izca.partial_fit(X[:batch_size])
  359. # Make sure data was not modified
  360. assert_almost_equal(X[:batch_size], X_orig[:batch_size])
  361. # Make sure single batch results match
  362. assert_almost_equal(zca.mean_, izca.mean_, decimal=3)
  363. print("Got here")
  364.  
  365. izca.fit(X[:100])
  366. izca.partial_fit(X[100:200])
  367. zca.fit(X[:200])
  368. # Make sure 2 batch results match
  369. assert_almost_equal(zca.mean_, izca.mean_, decimal=3)
  370. print("Got here 2")
  371. # Make sure the input array is not modified
  372. assert_almost_equal(X, X_orig, decimal=3)
  373. X_zca = zca.fit_transform(X)
  374. X_izca = izca.fit_transform(X)
  375. # Make sure the input array is not modified
  376. assert_almost_equal(X, X_orig, decimal=3)
  377. print("Got here 3")
  378. # Make sure the means are equal
  379. assert_almost_equal(zca.mean_, izca.mean_, decimal=3)
  380. print("Got here 4")
  381. # Make sure the components are equal
  382. assert_almost_equal(X_zca, X_izca, decimal=3)
  383. plt.imshow(X, cmap="gray")
  384. plt.title("Original")
  385. plt.figure()
  386. plt.imshow(X_zca, cmap="gray")
  387. plt.title("ZCA")
  388. plt.figure()
  389. plt.imshow(X_izca, cmap="gray")
  390. plt.title("IZCA")
  391. plt.figure()
  392. plt.matshow(zca.components_)
  393. plt.title("ZCA")
  394. plt.figure()
  395. plt.matshow(izca.components_)
  396. plt.title("IZCA")
  397. plt.show()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement