feat: add weight support to EOF classes

xarray-contrib · Feb 18, 2022 · 8821108 · 8821108
1 parent 52b98e6
commit 8821108
Show file tree

Hide file tree

Showing 4 changed files with 109 additions and 35 deletions.
diff --git a/xeofs/models/_eof_base.py b/xeofs/models/_eof_base.py
@@ -1,21 +1,65 @@
 from abc import abstractmethod
-from typing import Iterable
+from typing import Iterable, Optional
 
 import numpy as np
 from sklearn.decomposition import PCA
 
 
 class _EOF_base():
+ '''Base class for univariate EOF analysis.
+
+ Parameters
+ ----------
+ X : np.ndarray
+ 2D data matrix ``X`` with samples as rows and features as columns.
+ NaN entries will raise an error.
+ n_modes : Optional[int]
+ Number of modes to be retrieved. If None, then all possible modes will
+ be computed. Reducing ``n_modes`` can greatly speed up computational
+ (the default is None).
+ norm : bool
+ Normalize each feature by its standard deviation (the default is False).
+ weights : Optional[np.ndarray]
+ Apply `weights` to features. Weights must be a 1D array with the same
+ length as number of features. No NaN entries are allowed
+ (the default is None).
+
+ Attributes
+ ----------
+ n_samples : int
+ Number of samples (rows of data matrix).
+ n_features : int
+ Number of features (columns of data matrix).
+ n_modes : int
+ Number of modes.
+
+ '''
 
  def __init__(
  self,
- X: Iterable[np.ndarray],
- n_modes=None,
- norm=False
+ X: np.ndarray,
+ n_modes: Optional[int] = None,
+ norm: bool = False,
+ weights: Optional[np.ndarray] = None
  ):
 
+ # Weights are applied to features, not samples.
+ if weights is None:
+ # Use int type to ensure that there won't be rounding errors
+ # when applying trivial weighting (= all weights equal 1)
+ weights = np.ones(X.shape[1], dtype=int)
+
+ # Standardization is included as weights
  if norm:
- X /= X.std(axis=0)
+ stdev = X.std(axis=0)
+ if (stdev == 0).any():
+ err_msg = (
+ 'Standard deviation of one ore more features is zero, '
+ 'normalization not possible.'
+ )
+ raise ValueError(err_msg)
+ weights = weights / stdev
+ X = X * weights
 
  self.n_samples = X.shape[0]
  self.n_features = X.shape[1]
@@ -24,8 +68,6 @@ def __init__(
  if n_modes is None:
  self.n_modes = min(self.n_samples, self.n_features)
 
- # TODO: weights
-
  self.X = X
 
  def solve(self) -> None:
@@ -77,7 +119,7 @@ def singular_values(self) -> np.ndarray:
 
  return self._singular_values
 
- def explained_variance(self):
+ def explained_variance(self) -> np.ndarray:
  '''Get the explained variance.
 
  The explained variance is simply given by the individual eigenvalues
@@ -87,7 +129,7 @@ def explained_variance(self):
 
  return self._explained_variance
 
- def explained_variance_ratio(self):
+ def explained_variance_ratio(self) -> np.ndarray:
  '''Get the explained variance ratio.
 
  The explained variance ratio is the fraction of total variance
@@ -98,7 +140,7 @@ def explained_variance_ratio(self):
 
  return self._explained_variance_ratio
 
- def eofs(self):
+ def eofs(self) -> np.ndarray:
  '''Get the EOFs.
 
  The empirical orthogonal functions (EOFs) are equivalent to the eigenvectors
@@ -108,7 +150,7 @@ def eofs(self):
 
  return self._eofs
 
- def pcs(self):
+ def pcs(self) -> np.ndarray:
  '''Get the PCs.
 
  The principal components (PCs), also known as PC scores, are computed

diff --git a/xeofs/models/eof.py b/xeofs/models/eof.py
@@ -17,17 +17,21 @@ class EOF(_EOF_base):
  Parameters
  ----------
  X : np.ndarray
- Data to be decpomposed. ``X`` can be any N-dimensional array with the
- first dimension containing the variable whose variance is to be
- maximised. All remaining dimensions will be automatically reshaped to
- obtain a 2D matrix.
+ Data matrix to be decomposed. ``X`` can be any N-dimensional array.
+ Dimensions whose variance shall be maximised (denoted as *samples*)
+ have to be defined by the ``axis`` parameter. All remaining axes will
+ be reshaped into a new axis called *features*.
  n_modes : Optional[int]
  Number of modes to compute. Computing less modes can results in
  performance gains. If None, then the maximum number of modes is
  equivalent to ``min(n_samples, n_features)`` (the default is None).
  norm : bool
  Normalize each feature (e.g. grid cell) by its temporal standard
  deviation (the default is False).
+ weights : Optional[np.ndarray] = None
+ Weights applied to features. Must have the same dimensions as the
+ original features which are the remaining axes not specified by
+ ``axis`` parameter).
  axis : Union[int, Iterable[int]]
  Axis along which variance should be maximised. Can also be
  multi-dimensional. For example, given a data array of dimensions
@@ -82,18 +86,21 @@ class EOF(_EOF_base):
  def __init__(
  self,
  X: np.ndarray,
+ axis : Union[int, Iterable[int]] = 0,
  n_modes : Optional[int] = None,
  norm : bool = False,
- axis : Union[int, Iterable[int]] = 0
+ weights : Optional[np.ndarray] = None,
  ):
 
  self._tf = _ArrayTransformer()
  X = self._tf.fit_transform(X, axis=axis)
-
+ weights = self._tf.transform_weights(weights)
+
  super().__init__(
  X=X,
  n_modes=n_modes,
- norm=norm
+ norm=norm,
+ weights=weights
  )
 
  def eofs(self):

diff --git a/xeofs/pandas/eof.py b/xeofs/pandas/eof.py
@@ -14,15 +14,15 @@ class EOF(_EOF_base):
  ----------
  X : pd.DataFrame
  Data to be decpomposed.
+ axis : int
+ Axis along which variance should be maximsed (the default is 0).
  n_modes : Optional[int]
  Number of modes to compute. Computing less modes can results in
  performance gains. If None, then the maximum number of modes is
  equivalent to ``min(n_samples, n_features)`` (the default is None).
  norm : bool
  Normalize each feature (e.g. grid cell) by its temporal standard
  deviation (the default is False).
- axis : int
- Axis along which variance should be maximsed (the default is 0).
 
 
  Examples
@@ -71,21 +71,24 @@ class EOF(_EOF_base):
  def __init__(
  self,
  X: pd.DataFrame,
+ axis : int = 0,
  n_modes : Optional[int] = None,
  norm : bool = False,
- axis : int = 0
+ weights : Optional[np.ndarray] = None
  ):
 
  if(np.logical_not(isinstance(X, pd.DataFrame))):
  raise ValueError('This interface is for `pandas.DataFrame` only.')
 
  self._tf = _DataFrameTransformer()
  X = self._tf.fit_transform(X)
+ weights = self._tf.transform_weights(weights)
 
  super().__init__(
  X=X,
  n_modes=n_modes,
- norm=norm
+ norm=norm,
+ weights=weights
  )
  self._mode_idx = pd.Index(range(1, self.n_modes + 1), name='mode')
 

diff --git a/xeofs/xarray/eof.py b/xeofs/xarray/eof.py
@@ -14,20 +14,23 @@ class EOF(_EOF_base):
  ----------
  X : xr.DataArray
  Data to be decomposed.
- n_modes : Optional[int]
- Number of modes to compute. Computing less modes can results in
- performance gains. If None, then the maximum number of modes is
- equivalent to ``min(n_samples, n_features)`` (the default is None).
- norm : bool
- Normalize each feature (e.g. grid cell) by its temporal standard
- deviation (the default is False).
  dim : str
  Define the dimension which should considered for maximising variance.
  For most applications in climate science, temporal variance is
  maximised (also known as S-mode EOF analysis) i.e. the time dimension
  should be chosen. If spatial variance should be maximised
  (i.e. T-mode EOF analysis), set e.g. ``dim=['lon', 'lat']``
  (the default is ``time``).
+ n_modes : Union[int | None]
+ Number of modes to compute. Computing less modes can results in
+ performance gains. If None, then the maximum number of modes is
+ equivalent to ``min(n_samples, n_features)`` (the default is None).
+ norm : bool
+ Normalize each feature (e.g. grid cell) by its temporal standard
+ deviation (the default is False).
+ weights : Union[xr.DatArray | str | None]
+ Weights to be applied to data (features).
+
 
  Examples
  --------
@@ -96,25 +99,44 @@ class EOF(_EOF_base):
  def __init__(
  self,
  X: xr.DataArray,
+ dim: Union[str, Iterable[str]] = 'time',
  n_modes : Optional[int] = None,
  norm : bool = False,
- dim: Union[str, Iterable[str]] = 'time'
+ weights : Optional[Union[xr.DataArray, str]] = None
  ):
 
- if(np.logical_not(isinstance(X, xr.DataArray))):
- raise ValueError('This interface is for `xarray.DataArray` only.')
-
  self._tf = _DataArrayTransformer()
- X = self._tf.fit_transform(X, dim=dim)
+ self._tf.fit(X, dim=dim)
+ if weights == 'coslat':
+ weights = self._get_coslat_weights(X)
+ X = self._tf.transform(X)
+ weights = self._tf.transform_weights(weights)
 
  super().__init__(
  X=X,
  n_modes=n_modes,
- norm=norm
+ norm=norm,
+ weights=weights
  )
  self._mode_idx = xr.IndexVariable('mode', range(1, self.n_modes + 1))
  self._dim = dim
 
+ def _get_coslat_weights(self, X : xr.DataArray) -> xr.DataArray:
+ # Find dimension name of latitude
+ possible_lat_names = [
+ 'latitude', 'Latitude', 'lat', 'Lat', 'LATITUDE', 'LAT'
+ ]
+ idx_lat_dim = np.isin(X.dims, possible_lat_names)
+ lat_dim = np.array(X.dims)[idx_lat_dim][0]
+ # Compute coslat weights
+ weights = np.cos(np.deg2rad(X.coords[lat_dim]))
+ weights = np.sqrt(weights.where(weights > 0, 0))
+ # Broadcast latitude weights on other feature dimensions
+ sample_dims = self._tf.dims_samples
+ feature_grid = X.isel({k: 0 for k in sample_dims})
+ feature_grid = feature_grid.drop_vars(sample_dims)
+ return weights.broadcast_like(feature_grid)
+
  def singular_values(self) -> xr.DataArray:
  svalues = super().singular_values()
  return xr.DataArray(