diff --git a/xeofs/models/_eof_base.py b/xeofs/models/_eof_base.py index 5ef109a..8e909f2 100644 --- a/xeofs/models/_eof_base.py +++ b/xeofs/models/_eof_base.py @@ -1,21 +1,65 @@ from abc import abstractmethod -from typing import Iterable +from typing import Iterable, Optional import numpy as np from sklearn.decomposition import PCA class _EOF_base(): + '''Base class for univariate EOF analysis. + + Parameters + ---------- + X : np.ndarray + 2D data matrix ``X`` with samples as rows and features as columns. + NaN entries will raise an error. + n_modes : Optional[int] + Number of modes to be retrieved. If None, then all possible modes will + be computed. Reducing ``n_modes`` can greatly speed up computational + (the default is None). + norm : bool + Normalize each feature by its standard deviation (the default is False). + weights : Optional[np.ndarray] + Apply `weights` to features. Weights must be a 1D array with the same + length as number of features. No NaN entries are allowed + (the default is None). + + Attributes + ---------- + n_samples : int + Number of samples (rows of data matrix). + n_features : int + Number of features (columns of data matrix). + n_modes : int + Number of modes. + + ''' def __init__( self, - X: Iterable[np.ndarray], - n_modes=None, - norm=False + X: np.ndarray, + n_modes: Optional[int] = None, + norm: bool = False, + weights: Optional[np.ndarray] = None ): + # Weights are applied to features, not samples. + if weights is None: + # Use int type to ensure that there won't be rounding errors + # when applying trivial weighting (= all weights equal 1) + weights = np.ones(X.shape[1], dtype=int) + + # Standardization is included as weights if norm: - X /= X.std(axis=0) + stdev = X.std(axis=0) + if (stdev == 0).any(): + err_msg = ( + 'Standard deviation of one ore more features is zero, ' + 'normalization not possible.' + ) + raise ValueError(err_msg) + weights = weights / stdev + X = X * weights self.n_samples = X.shape[0] self.n_features = X.shape[1] @@ -24,8 +68,6 @@ def __init__( if n_modes is None: self.n_modes = min(self.n_samples, self.n_features) - # TODO: weights - self.X = X def solve(self) -> None: @@ -77,7 +119,7 @@ def singular_values(self) -> np.ndarray: return self._singular_values - def explained_variance(self): + def explained_variance(self) -> np.ndarray: '''Get the explained variance. The explained variance is simply given by the individual eigenvalues @@ -87,7 +129,7 @@ def explained_variance(self): return self._explained_variance - def explained_variance_ratio(self): + def explained_variance_ratio(self) -> np.ndarray: '''Get the explained variance ratio. The explained variance ratio is the fraction of total variance @@ -98,7 +140,7 @@ def explained_variance_ratio(self): return self._explained_variance_ratio - def eofs(self): + def eofs(self) -> np.ndarray: '''Get the EOFs. The empirical orthogonal functions (EOFs) are equivalent to the eigenvectors @@ -108,7 +150,7 @@ def eofs(self): return self._eofs - def pcs(self): + def pcs(self) -> np.ndarray: '''Get the PCs. The principal components (PCs), also known as PC scores, are computed diff --git a/xeofs/models/eof.py b/xeofs/models/eof.py index 64fefd8..40bccb5 100644 --- a/xeofs/models/eof.py +++ b/xeofs/models/eof.py @@ -17,10 +17,10 @@ class EOF(_EOF_base): Parameters ---------- X : np.ndarray - Data to be decpomposed. ``X`` can be any N-dimensional array with the - first dimension containing the variable whose variance is to be - maximised. All remaining dimensions will be automatically reshaped to - obtain a 2D matrix. + Data matrix to be decomposed. ``X`` can be any N-dimensional array. + Dimensions whose variance shall be maximised (denoted as *samples*) + have to be defined by the ``axis`` parameter. All remaining axes will + be reshaped into a new axis called *features*. n_modes : Optional[int] Number of modes to compute. Computing less modes can results in performance gains. If None, then the maximum number of modes is @@ -28,6 +28,10 @@ class EOF(_EOF_base): norm : bool Normalize each feature (e.g. grid cell) by its temporal standard deviation (the default is False). + weights : Optional[np.ndarray] = None + Weights applied to features. Must have the same dimensions as the + original features which are the remaining axes not specified by + ``axis`` parameter). axis : Union[int, Iterable[int]] Axis along which variance should be maximised. Can also be multi-dimensional. For example, given a data array of dimensions @@ -82,18 +86,21 @@ class EOF(_EOF_base): def __init__( self, X: np.ndarray, + axis : Union[int, Iterable[int]] = 0, n_modes : Optional[int] = None, norm : bool = False, - axis : Union[int, Iterable[int]] = 0 + weights : Optional[np.ndarray] = None, ): self._tf = _ArrayTransformer() X = self._tf.fit_transform(X, axis=axis) - + weights = self._tf.transform_weights(weights) + super().__init__( X=X, n_modes=n_modes, - norm=norm + norm=norm, + weights=weights ) def eofs(self): diff --git a/xeofs/pandas/eof.py b/xeofs/pandas/eof.py index bd86ad9..a47bc62 100644 --- a/xeofs/pandas/eof.py +++ b/xeofs/pandas/eof.py @@ -14,6 +14,8 @@ class EOF(_EOF_base): ---------- X : pd.DataFrame Data to be decpomposed. + axis : int + Axis along which variance should be maximsed (the default is 0). n_modes : Optional[int] Number of modes to compute. Computing less modes can results in performance gains. If None, then the maximum number of modes is @@ -21,8 +23,6 @@ class EOF(_EOF_base): norm : bool Normalize each feature (e.g. grid cell) by its temporal standard deviation (the default is False). - axis : int - Axis along which variance should be maximsed (the default is 0). Examples @@ -71,9 +71,10 @@ class EOF(_EOF_base): def __init__( self, X: pd.DataFrame, + axis : int = 0, n_modes : Optional[int] = None, norm : bool = False, - axis : int = 0 + weights : Optional[np.ndarray] = None ): if(np.logical_not(isinstance(X, pd.DataFrame))): @@ -81,11 +82,13 @@ def __init__( self._tf = _DataFrameTransformer() X = self._tf.fit_transform(X) + weights = self._tf.transform_weights(weights) super().__init__( X=X, n_modes=n_modes, - norm=norm + norm=norm, + weights=weights ) self._mode_idx = pd.Index(range(1, self.n_modes + 1), name='mode') diff --git a/xeofs/xarray/eof.py b/xeofs/xarray/eof.py index 88fbc7f..db71c1a 100644 --- a/xeofs/xarray/eof.py +++ b/xeofs/xarray/eof.py @@ -14,13 +14,6 @@ class EOF(_EOF_base): ---------- X : xr.DataArray Data to be decomposed. - n_modes : Optional[int] - Number of modes to compute. Computing less modes can results in - performance gains. If None, then the maximum number of modes is - equivalent to ``min(n_samples, n_features)`` (the default is None). - norm : bool - Normalize each feature (e.g. grid cell) by its temporal standard - deviation (the default is False). dim : str Define the dimension which should considered for maximising variance. For most applications in climate science, temporal variance is @@ -28,6 +21,16 @@ class EOF(_EOF_base): should be chosen. If spatial variance should be maximised (i.e. T-mode EOF analysis), set e.g. ``dim=['lon', 'lat']`` (the default is ``time``). + n_modes : Union[int | None] + Number of modes to compute. Computing less modes can results in + performance gains. If None, then the maximum number of modes is + equivalent to ``min(n_samples, n_features)`` (the default is None). + norm : bool + Normalize each feature (e.g. grid cell) by its temporal standard + deviation (the default is False). + weights : Union[xr.DatArray | str | None] + Weights to be applied to data (features). + Examples -------- @@ -96,25 +99,44 @@ class EOF(_EOF_base): def __init__( self, X: xr.DataArray, + dim: Union[str, Iterable[str]] = 'time', n_modes : Optional[int] = None, norm : bool = False, - dim: Union[str, Iterable[str]] = 'time' + weights : Optional[Union[xr.DataArray, str]] = None ): - if(np.logical_not(isinstance(X, xr.DataArray))): - raise ValueError('This interface is for `xarray.DataArray` only.') - self._tf = _DataArrayTransformer() - X = self._tf.fit_transform(X, dim=dim) + self._tf.fit(X, dim=dim) + if weights == 'coslat': + weights = self._get_coslat_weights(X) + X = self._tf.transform(X) + weights = self._tf.transform_weights(weights) super().__init__( X=X, n_modes=n_modes, - norm=norm + norm=norm, + weights=weights ) self._mode_idx = xr.IndexVariable('mode', range(1, self.n_modes + 1)) self._dim = dim + def _get_coslat_weights(self, X : xr.DataArray) -> xr.DataArray: + # Find dimension name of latitude + possible_lat_names = [ + 'latitude', 'Latitude', 'lat', 'Lat', 'LATITUDE', 'LAT' + ] + idx_lat_dim = np.isin(X.dims, possible_lat_names) + lat_dim = np.array(X.dims)[idx_lat_dim][0] + # Compute coslat weights + weights = np.cos(np.deg2rad(X.coords[lat_dim])) + weights = np.sqrt(weights.where(weights > 0, 0)) + # Broadcast latitude weights on other feature dimensions + sample_dims = self._tf.dims_samples + feature_grid = X.isel({k: 0 for k in sample_dims}) + feature_grid = feature_grid.drop_vars(sample_dims) + return weights.broadcast_like(feature_grid) + def singular_values(self) -> xr.DataArray: svalues = super().singular_values() return xr.DataArray(