diff --git a/xeofs/models/eof.py b/xeofs/models/eof.py index 6320648..4a56336 100644 --- a/xeofs/models/eof.py +++ b/xeofs/models/eof.py @@ -13,6 +13,69 @@ class EOF(_EOF_base): + '''EOF analysis of a single ``np.ndarray``. + + Parameters + ---------- + X : np.ndarray + Data to be decpomposed. ``X`` can be any N-dimensional array with the + first dimension containing the variable whose variance is to be + maximised. All remaining dimensions will be automatically reshaped to + obtain a 2D matrix. For example, given data of `n` time series of ``p1`` different + variables each measured at ``p2`` different locations, the data matrix + ``X`` of dimensions ``(n x p1 x p2)`` will maximise `temporal` variance. + In contrast, if provided as ``(p2 x n x p1)``, the `spatial` variance + will be maximised. + n_modes : Optional[int] + Number of modes to compute. Computing less modes can results in + performance gains. If None, then the maximum number of modes is + equivalent to ``min(n_samples, n_features)`` (the default is None). + norm : bool + Normalize each feature (e.g. grid cell) by its temporal standard + deviation (the default is False). + + + Examples + -------- + + Import package and create data: + + >>> import numpy as np + >>> from xeofs.models import EOF + >>> rng = np.random.default_rng(7) + >>> X = rng.standard_normal((4, 2, 3)) + + Initialize standardized EOF analysis and compute the first 2 modes: + + >>> model = EOF(X, norm=True, n_modes=2) + >>> model.solve() + + Get explained variance: + + >>> model.explained_variance() + ... array([4.81848833, 2.20765019]) + + Get EOFs: + + >>> model.eofs() + ... array([[[ 0.52002146, -0.00698575], + ... [ 0.41059796, -0.31603563]], + ... + ... [[ 0.32091783, 0.47647144], + ... [-0.51771611, 0.01380736]], + ... + ... [[-0.28840959, 0.63341346], + ... [ 0.32678537, 0.52119516]]]) + + Get PCs: + + >>> model.pcs() + ... array([[ 1.76084189, -0.92030205], + ... [-2.13581896, -1.49704775], + ... [ 2.02091078, 0.65502667], + ... [-1.64593371, 1.76232312]]) + + ''' def __init__( self, @@ -29,12 +92,31 @@ def __init__( super().__init__( X=self.X, - Y=None, n_modes=n_modes, norm=norm ) - def solve(self): + def solve(self) -> None: + ''' + Perform the EOF analysis. + + To boost performance, the standard solver is based on + the PCA implementation of scikit-learn [1]_ which uses different algorithms + to perform the decomposition based on the data matrix size. + + Naive approaches using singular value decomposition of the + data matrix ``X (n x p)`` or the covariance matrix ``C (p x p)`` + quickly become infeasable computationally when the number of + samples :math:`n` or features :math:`p` increase (computational power increases + by :math:`O(n^2p)` and :math:`O(p^3)`, respectively.) + + + References + ---------- + .. [1] https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.PCA.html + + ''' + pca = PCA(n_components=self.n_modes) self._pcs = pca.fit_transform(self.X) self._singular_values = pca.singular_values_ @@ -48,17 +130,58 @@ def solve(self): self._eofs *= flip_signs self._pcs *= flip_signs - def singular_values(self): + def singular_values(self) -> np.ndarray: + '''Get the singular values. + + The `i` th singular value :math:`\sigma_i` is defined by + + .. math:: + \sigma_i = \sqrt{n \lambda_i} + + where :math:`\lambda_i` and :math:`n` are the associated eigenvalues + and the number of samples, respectively. + + ''' + return self._singular_values def explained_variance(self): + '''Get the explained variance. + + The explained variance is simply given by the individual eigenvalues + of the covariance matrix. + + ''' + return self._explained_variance def explained_variance_ratio(self): + '''Get the explained variance ratio. + + The explained variance ratio is the fraction of total variance + explained by a given mode and is calculated by :math:`\lambda_i / \sum_i^m \lambda_i` + where `m` is the total number of modes. + + ''' + return self._explained_variance_ratio def eofs(self): + '''Get the EOFs. + + The empirical orthogonal functions (EOFs) are equivalent to the eigenvectors + of the covariance matrix of `X`. + + ''' + return self._arr_tf.back_transform(self._eofs.T).T def pcs(self): + '''Get the PCs. + + The principal components (PCs), also known as PC scores, are computed + by projecting the data matrix `X` onto the eigenvectors. + + ''' + return self._pcs diff --git a/xeofs/pandas/eof.py b/xeofs/pandas/eof.py index 585c306..a18c18c 100644 --- a/xeofs/pandas/eof.py +++ b/xeofs/pandas/eof.py @@ -8,6 +8,64 @@ class EOF(models.eof.EOF): + '''EOF analysis of a single ``pd.DataFrame``. + + Parameters + ---------- + X : pd.DataFrame + Data to be decpomposed. First dimension contains the variable whose variance is to be + maximised. + n_modes : Optional[int] + Number of modes to compute. Computing less modes can results in + performance gains. If None, then the maximum number of modes is + equivalent to ``min(n_samples, n_features)`` (the default is None). + norm : bool + Normalize each feature (e.g. grid cell) by its temporal standard + deviation (the default is False). + + + Examples + -------- + + Import package and create data: + + >>> import pandas as pd + >>> from xeofs.pandas import EOF + >>> rng = np.random.default_rng(7) + >>> X = rng.standard_normal((4, 3)) + >>> df = pd.DataFrame(X) + + Initialize standardized EOF analysis and compute the first 2 modes: + + >>> model = EOF(df, norm=True, n_modes=2) + >>> model.solve() + + Get explained variance: + + >>> model.explained_variance() + ... explained_variance + ... mode + ... 1 2.562701 + ... 2 1.167054 + + Get EOFs: + + >>> model.eofs() + ... mode 1 2 + ... 0 0.626041 -0.428431 + ... 1 0.677121 -0.115737 + ... 2 0.386755 0.896132 + + Get PCs: + + >>> model.pcs() + ... mode 1 2 + ... 0 0.495840 -0.221963 + ... 1 -2.254508 -0.470420 + ... 2 1.516900 -0.876695 + ... 3 0.241768 1.569078 + + ''' def __init__( self, @@ -24,7 +82,6 @@ def __init__( super().__init__( X=X, - Y=None, n_modes=n_modes, norm=norm ) diff --git a/xeofs/xarray/eof.py b/xeofs/xarray/eof.py index f4e38f4..7c6f675 100644 --- a/xeofs/xarray/eof.py +++ b/xeofs/xarray/eof.py @@ -8,6 +8,88 @@ class EOF(models.eof.EOF): + '''EOF analysis of a single ``xr.DataArray``. + + Parameters + ---------- + X : xr.DataArray + Data to be decomposed. + n_modes : Optional[int] + Number of modes to compute. Computing less modes can results in + performance gains. If None, then the maximum number of modes is + equivalent to ``min(n_samples, n_features)`` (the default is None). + norm : bool + Normalize each feature (e.g. grid cell) by its temporal standard + deviation (the default is False). + dim : str + Define the dimension which should considered for maximising variance. + For most applications in climate science, temporal variance is + maximised (also known as S-mode EOF analysis) i.e. the time dimension + should be chosen (the default is ``time``). + + Examples + -------- + + Import package and create data: + + >>> import xarray as xr + >>> from xeofs.xarray import EOF + >>> da = xr.tutorial.load_dataset('rasm')['Tair'] + >>> da = xr.tutorial.load_dataset('air_temperature')['air'] + >>> da = da.isel(lon=slice(0, 3), lat=slice(0, 2)) + + Initialize standardized EOF analysis and compute the first 2 modes: + + >>> model = EOF(da, norm=True, n_modes=2) + >>> model.solve() + + Get explained variance: + + >>> model.explained_variance() + ... xarray.DataArray'explained_variance'mode: 2 + ... array([5.8630486 , 0.12335653], dtype=float32) + ... Coordinates: + ... mode (mode) int64 1 2 + ... Attributes: + ... (0) + + Get EOFs: + + >>> model.eofs() + ... xarray.DataArray 'EOFs' lon: 3 lat: 2 mode: 2 + ... array([[[ 0.4083837 , -0.39021498], + ... [ 0.40758175, 0.42474997]], + ... + ... [[ 0.40875185, -0.40969774], + ... [ 0.40863484, 0.41475943]], + ... + ... [[ 0.40776297, -0.4237887 ], + ... [ 0.40837321, 0.38450614]]]) + ... Coordinates: + ... lat (lat) float32 75.0 72.5 + ... lon (lon) float32 200.0 202.5 205.0 + ... mode (mode) int64 1 2 + ... Attributes: + ... (0) + + Get PCs: + + >>> model.pcs() + ... xarray.DataArray'PCs'time: 2920mode: 2 + ... array([[-3.782707 , -0.07754549], + ... [-3.7966802 , -0.13775176], + ... [-3.7969239 , -0.05770111], + ... ..., + ... [-3.2584608 , 0.3592216 ], + ... [-3.031799 , 0.2055658 ], + ... [-3.0840495 , 0.25031802]], dtype=float32) + ... Coordinates: + ... time (time) datetime64[ns] 2013-01-01 ... 2014-12-31T18:00:00 + ... mode (mode) int64 1 2 + ... Attributes: + ... (0) + + ''' def __init__( self, @@ -25,14 +107,13 @@ def __init__( super().__init__( X=X, - Y=None, n_modes=n_modes, norm=norm ) self._mode_idx = xr.IndexVariable('mode', range(1, self.n_modes + 1)) self._dim = dim - def singular_values(self): + def singular_values(self) -> xr.DataArray: svalues = super().singular_values() return xr.DataArray( svalues, @@ -41,7 +122,7 @@ def singular_values(self): name='singular_values' ) - def explained_variance(self): + def explained_variance(self) -> xr.DataArray: expvar = super().explained_variance() return xr.DataArray( expvar, @@ -50,7 +131,7 @@ def explained_variance(self): name='explained_variance' ) - def explained_variance_ratio(self): + def explained_variance_ratio(self) -> xr.DataArray: expvar = super().explained_variance_ratio() return xr.DataArray( expvar, @@ -59,7 +140,7 @@ def explained_variance_ratio(self): name='explained_variance_ratio' ) - def eofs(self): + def eofs(self) -> xr.DataArray: eofs = self._eofs eofs = self._da_tf.back_transform(eofs.T).T eofs = eofs.rename({self._dim : 'mode'}) @@ -67,7 +148,7 @@ def eofs(self): eofs.name = 'EOFs' return eofs - def pcs(self): + def pcs(self) -> xr.DataArray: pcs = super().pcs() coords = { self._dim : self._da_tf.coords_in[self._dim],