@@ -86,6 +86,21 @@ class PCA(sklearn.decomposition.PCA):
        If None, the random number generator is the RandomState instance used
        by `da.random`. Used when ``svd_solver`` == 'randomized'.

+    center : bool, optional (default True)
+        When True (the default), the data is centered by subtracting the
+        per-feature mean before the decomposition.
+
+        PCA is performed on centered data because it is effectively a
+        regression model without an intercept: the principal components
+        originate at the origin of the transformed space.
+
+        ``center=False`` may be used when performing PCA on data that has
+        already been centered.
+
+        Since centering is a required step of whitening, setting
+        ``center=False`` together with ``whiten=True`` may produce
+        unexpected results if the data was not centered beforehand.
+
    Attributes
    ----------
    components_ : array, shape (n_components, n_features)
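To make the new parameter's semantics concrete, here is a minimal sketch (not part of the patch) assuming the patched class is importable as `dask_ml.decomposition.PCA`: fitting with the default `center=True` on raw data should agree with fitting `center=False` on manually centered data.

```python
# Minimal sketch of the equivalence described in the docstring above.
import dask.array as da
from dask_ml.decomposition import PCA  # assumes this patched version

X = da.random.random((100, 3), chunks=(50, 3))

# Default behavior: PCA centers the data internally.
pca_auto = PCA(n_components=2, svd_solver="full").fit(X)

# Equivalent: center manually, then opt out of internal centering.
X_centered = X - X.mean(axis=0)
pca_manual = PCA(n_components=2, svd_solver="full", center=False).fit(X_centered)

# Both fits decompose the same centered data, so these agree up to
# numerical noise.
print(pca_auto.explained_variance_ratio_)
print(pca_manual.explained_variance_ratio_)
```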
@@ -152,18 +167,27 @@ class PCA(sklearn.decomposition.PCA):
    PCA(copy=True, iterated_power='auto', n_components=2, random_state=None,
      svd_solver='auto', tol=0.0, whiten=False)
    >>> print(pca.explained_variance_ratio_)  # doctest: +ELLIPSIS
-    [ 0.99244... 0.00755... ]
+    [0.99244289 0.00755711]
    >>> print(pca.singular_values_)  # doctest: +ELLIPSIS
-    [ 6.30061... 0.54980... ]
+    [6.30061232 0.54980396]

    >>> pca = PCA(n_components=2, svd_solver='full')
    >>> pca.fit(dX)  # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
    PCA(copy=True, iterated_power='auto', n_components=2, random_state=None,
      svd_solver='full', tol=0.0, whiten=False)
    >>> print(pca.explained_variance_ratio_)  # doctest: +ELLIPSIS
-    [ 0.99244... 0.00755...]
+    [0.99244289 0.00755711]
+    >>> print(pca.singular_values_)  # doctest: +ELLIPSIS
+    [6.30061232 0.54980396]
+
+    >>> dX_mean_0 = dX - dX.mean(axis=0)
+    >>> pca = PCA(n_components=2, svd_solver='full', center=False)
+    >>> pca.fit(dX_mean_0)
+    PCA(center=False, n_components=2, svd_solver='full')
+    >>> print(pca.explained_variance_ratio_)  # doctest: +ELLIPSIS
+    [0.99244289 0.00755711]
    >>> print(pca.singular_values_)  # doctest: +ELLIPSIS
-    [ 6.30061... 0.54980... ]
+    [6.30061232 0.54980396]

    Notes
    -----
@@ -175,6 +199,10 @@ class PCA(sklearn.decomposition.PCA):
      ``dask.linalg.svd_compressed``.
    * n_components : ``n_components='mle'`` is not allowed.
      Fractional ``n_components`` between 0 and 1 is not allowed.
+    * center : if ``True`` (the default), the input data is automatically
+      centered before performing PCA.
+      Set this parameter to ``False`` if the input data has already been
+      centered before calling ``fit()``.
    """

    def __init__(
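The whitening caveat from the parameter documentation above also applies here; a short hedged sketch (reusing the `PCA` import from the first example, with `X_uncentered` as a hypothetical raw dask array):

```python
# Whitening rescales components by the explained variance, which is only
# meaningful on centered data. This combination is therefore safe only if
# the caller has centered X_uncentered beforehand.
risky = PCA(n_components=2, whiten=True, center=False)
# risky.fit(X_uncentered)  # may produce unexpected results on raw data

# Either let PCA center internally (center=True is the default)...
safe = PCA(n_components=2, whiten=True)
# ...or center explicitly before opting out of internal centering.
```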
@@ -186,10 +214,12 @@ def __init__(
        tol=0.0,
        iterated_power=0,
        random_state=None,
+        center=True,
    ):
        self.n_components = n_components
        self.copy = copy
        self.whiten = whiten
+        self.center = center
        self.svd_solver = svd_solver
        self.tol = tol
        self.iterated_power = iterated_power
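Since `__init__` only records the flag, `center` behaves like any other hyperparameter; a sketch, assuming the standard scikit-learn estimator machinery inherited from the base class:

```python
# Sketch: `center` is stored verbatim in __init__, so it is picked up by the
# get_params/set_params machinery inherited from sklearn.decomposition.PCA.
from dask_ml.decomposition import PCA  # assumes this patched version

pca = PCA(n_components=2, center=False)
assert pca.get_params()["center"] is False
pca.set_params(center=True)  # round-trips like any other hyperparameter
```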
@@ -198,6 +228,7 @@ def __init__(
    def fit(self, X, y=None):
        if not dask.is_dask_collection(X):
            raise TypeError(_TYPE_MSG.format(type(X)))
+
        self._fit(X)
        self.n_features_in_ = X.shape[1]
        return self
@@ -266,8 +297,10 @@ def _fit(self, X):

        solver = self._get_solver(X, n_components)

-        self.mean_ = X.mean(0)
-        X -= self.mean_
+        self.mean_ = X.mean(axis=0)
+
+        if self.center:
+            X -= self.mean_

        if solver in {"full", "tsqr"}:
            U, S, V = da.linalg.svd(X)
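One observable consequence of moving the subtraction under `if self.center:` while leaving the mean computation unconditional: `mean_` is populated either way. A sketch, reusing `da`, `PCA`, and `X` from the first example:

```python
# mean_ is always recorded during fit; with center=False it simply is not
# subtracted from the data before the SVD.
pca = PCA(n_components=2, svd_solver="full", center=False).fit(X)
means = da.asarray(pca.mean_)  # mean_ may be lazy; coerce before printing
print(means.compute())         # per-feature means of X, shape (n_features,)
```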
@@ -370,14 +403,20 @@ def transform(self, X):
        X_new : array-like, shape (n_samples, n_components)

        """
-        check_is_fitted(self, ["mean_", "components_"])
+        check_is_fitted(self, "components_")
+
+        if self.whiten:
+            check_is_fitted(self, "explained_variance_")
+
+        if self.center:
+            check_is_fitted(self, "mean_")
+            if self.mean_ is not None:
+                X -= self.mean_

-        # X = check_array(X)
-        if self.mean_ is not None:
-            X = X - self.mean_
        X_transformed = da.dot(X, self.components_.T)
        if self.whiten:
            X_transformed /= np.sqrt(self.explained_variance_)
+
        return X_transformed

    def fit_transform(self, X, y=None):
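On the default path (`center=True`, `whiten=False`) the new `transform` is still the familiar centered projection; a hand-rolled cross-check, as a sketch reusing names from the first example:

```python
# Manual equivalent of transform() on the default path.
pca = PCA(n_components=2, svd_solver="full").fit(X)
manual = da.dot(X - pca.mean_, pca.components_.T)
auto = pca.transform(X)
print(da.allclose(manual, auto).compute())  # expected: True
```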
@@ -396,7 +435,6 @@ def fit_transform(self, X, y=None):
        X_new : array-like, shape (n_samples, n_components)

        """
-        # X = check_array(X)
        if not dask.is_dask_collection(X):
            raise TypeError(_TYPE_MSG.format(type(X)))
        U, S, V = self._fit(X)
@@ -431,18 +469,25 @@ def inverse_transform(self, X):
        If whitening is enabled, inverse_transform does not compute the
        exact inverse operation of transform.
        """
-        check_is_fitted(self, "mean_")
+        check_is_fitted(self, "components_")
+
+        if self.center:
+            check_is_fitted(self, "mean_")
+            offset = self.mean_
+        else:
+            offset = 0

        if self.whiten:
+            check_is_fitted(self, "explained_variance_")
            return (
                da.dot(
                    X,
                    np.sqrt(self.explained_variance_[:, np.newaxis]) * self.components_,
                )
-                + self.mean_
+                + offset
            )
-        else:
-            return da.dot(X, self.components_) + self.mean_
+
+        return da.dot(X, self.components_) + offset

    def score_samples(self, X):
        """Return the log-likelihood of each sample.
@@ -463,8 +508,11 @@ def score_samples(self, X):
        """
        check_is_fitted(self, "mean_")

-        # X = check_array(X)
-        Xr = X - self.mean_
+        if self.center:
+            Xr = X - self.mean_
+        else:
+            Xr = X
+
        n_features = X.shape[1]
        precision = self.get_precision()  # [n_features, n_features]
        log_like = -0.5 * (Xr * (da.dot(Xr, precision))).sum(axis=1)
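Finally, a usage sketch for `score_samples` under the new flag, reusing names from the first example: `Xr` is centered only when `center=True`, mirroring what was done at fit time.

```python
# Per-sample Gaussian log-likelihood under the fitted PCA model.
pca = PCA(n_components=2, svd_solver="full").fit(X)
log_likelihood = pca.score_samples(X)  # lazy dask array, shape (n_samples,)
print(log_likelihood[:5].compute())
```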