Skip to content

Commit

Permalink
feat: add weight support to EOF classes
Browse files Browse the repository at this point in the history
  • Loading branch information
nicrie committed Feb 18, 2022
1 parent 52b98e6 commit 8821108
Show file tree
Hide file tree
Showing 4 changed files with 109 additions and 35 deletions.
64 changes: 53 additions & 11 deletions xeofs/models/_eof_base.py
Original file line number Diff line number Diff line change
@@ -1,21 +1,65 @@
from abc import abstractmethod
from typing import Iterable
from typing import Iterable, Optional

import numpy as np
from sklearn.decomposition import PCA


class _EOF_base():
'''Base class for univariate EOF analysis.
Parameters
----------
X : np.ndarray
2D data matrix ``X`` with samples as rows and features as columns.
NaN entries will raise an error.
n_modes : Optional[int]
Number of modes to be retrieved. If None, then all possible modes will
be computed. Reducing ``n_modes`` can greatly speed up computational
(the default is None).
norm : bool
Normalize each feature by its standard deviation (the default is False).
weights : Optional[np.ndarray]
Apply `weights` to features. Weights must be a 1D array with the same
length as number of features. No NaN entries are allowed
(the default is None).
Attributes
----------
n_samples : int
Number of samples (rows of data matrix).
n_features : int
Number of features (columns of data matrix).
n_modes : int
Number of modes.
'''

def __init__(
self,
X: Iterable[np.ndarray],
n_modes=None,
norm=False
X: np.ndarray,
n_modes: Optional[int] = None,
norm: bool = False,
weights: Optional[np.ndarray] = None
):

# Weights are applied to features, not samples.
if weights is None:
# Use int type to ensure that there won't be rounding errors
# when applying trivial weighting (= all weights equal 1)
weights = np.ones(X.shape[1], dtype=int)

# Standardization is included as weights
if norm:
X /= X.std(axis=0)
stdev = X.std(axis=0)
if (stdev == 0).any():
err_msg = (
'Standard deviation of one ore more features is zero, '
'normalization not possible.'
)
raise ValueError(err_msg)
weights = weights / stdev
X = X * weights

self.n_samples = X.shape[0]
self.n_features = X.shape[1]
Expand All @@ -24,8 +68,6 @@ def __init__(
if n_modes is None:
self.n_modes = min(self.n_samples, self.n_features)

# TODO: weights

self.X = X

def solve(self) -> None:
Expand Down Expand Up @@ -77,7 +119,7 @@ def singular_values(self) -> np.ndarray:

return self._singular_values

def explained_variance(self):
def explained_variance(self) -> np.ndarray:
'''Get the explained variance.
The explained variance is simply given by the individual eigenvalues
Expand All @@ -87,7 +129,7 @@ def explained_variance(self):

return self._explained_variance

def explained_variance_ratio(self):
def explained_variance_ratio(self) -> np.ndarray:
'''Get the explained variance ratio.
The explained variance ratio is the fraction of total variance
Expand All @@ -98,7 +140,7 @@ def explained_variance_ratio(self):

return self._explained_variance_ratio

def eofs(self):
def eofs(self) -> np.ndarray:
'''Get the EOFs.
The empirical orthogonal functions (EOFs) are equivalent to the eigenvectors
Expand All @@ -108,7 +150,7 @@ def eofs(self):

return self._eofs

def pcs(self):
def pcs(self) -> np.ndarray:
'''Get the PCs.
The principal components (PCs), also known as PC scores, are computed
Expand Down
21 changes: 14 additions & 7 deletions xeofs/models/eof.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,17 +17,21 @@ class EOF(_EOF_base):
Parameters
----------
X : np.ndarray
Data to be decpomposed. ``X`` can be any N-dimensional array with the
first dimension containing the variable whose variance is to be
maximised. All remaining dimensions will be automatically reshaped to
obtain a 2D matrix.
Data matrix to be decomposed. ``X`` can be any N-dimensional array.
Dimensions whose variance shall be maximised (denoted as *samples*)
have to be defined by the ``axis`` parameter. All remaining axes will
be reshaped into a new axis called *features*.
n_modes : Optional[int]
Number of modes to compute. Computing less modes can results in
performance gains. If None, then the maximum number of modes is
equivalent to ``min(n_samples, n_features)`` (the default is None).
norm : bool
Normalize each feature (e.g. grid cell) by its temporal standard
deviation (the default is False).
weights : Optional[np.ndarray] = None
Weights applied to features. Must have the same dimensions as the
original features which are the remaining axes not specified by
``axis`` parameter).
axis : Union[int, Iterable[int]]
Axis along which variance should be maximised. Can also be
multi-dimensional. For example, given a data array of dimensions
Expand Down Expand Up @@ -82,18 +86,21 @@ class EOF(_EOF_base):
def __init__(
self,
X: np.ndarray,
axis : Union[int, Iterable[int]] = 0,
n_modes : Optional[int] = None,
norm : bool = False,
axis : Union[int, Iterable[int]] = 0
weights : Optional[np.ndarray] = None,
):

self._tf = _ArrayTransformer()
X = self._tf.fit_transform(X, axis=axis)

weights = self._tf.transform_weights(weights)

super().__init__(
X=X,
n_modes=n_modes,
norm=norm
norm=norm,
weights=weights
)

def eofs(self):
Expand Down
11 changes: 7 additions & 4 deletions xeofs/pandas/eof.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,15 +14,15 @@ class EOF(_EOF_base):
----------
X : pd.DataFrame
Data to be decpomposed.
axis : int
Axis along which variance should be maximsed (the default is 0).
n_modes : Optional[int]
Number of modes to compute. Computing less modes can results in
performance gains. If None, then the maximum number of modes is
equivalent to ``min(n_samples, n_features)`` (the default is None).
norm : bool
Normalize each feature (e.g. grid cell) by its temporal standard
deviation (the default is False).
axis : int
Axis along which variance should be maximsed (the default is 0).
Examples
Expand Down Expand Up @@ -71,21 +71,24 @@ class EOF(_EOF_base):
def __init__(
self,
X: pd.DataFrame,
axis : int = 0,
n_modes : Optional[int] = None,
norm : bool = False,
axis : int = 0
weights : Optional[np.ndarray] = None
):

if(np.logical_not(isinstance(X, pd.DataFrame))):
raise ValueError('This interface is for `pandas.DataFrame` only.')

self._tf = _DataFrameTransformer()
X = self._tf.fit_transform(X)
weights = self._tf.transform_weights(weights)

super().__init__(
X=X,
n_modes=n_modes,
norm=norm
norm=norm,
weights=weights
)
self._mode_idx = pd.Index(range(1, self.n_modes + 1), name='mode')

Expand Down
48 changes: 35 additions & 13 deletions xeofs/xarray/eof.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,20 +14,23 @@ class EOF(_EOF_base):
----------
X : xr.DataArray
Data to be decomposed.
n_modes : Optional[int]
Number of modes to compute. Computing less modes can results in
performance gains. If None, then the maximum number of modes is
equivalent to ``min(n_samples, n_features)`` (the default is None).
norm : bool
Normalize each feature (e.g. grid cell) by its temporal standard
deviation (the default is False).
dim : str
Define the dimension which should considered for maximising variance.
For most applications in climate science, temporal variance is
maximised (also known as S-mode EOF analysis) i.e. the time dimension
should be chosen. If spatial variance should be maximised
(i.e. T-mode EOF analysis), set e.g. ``dim=['lon', 'lat']``
(the default is ``time``).
n_modes : Union[int | None]
Number of modes to compute. Computing less modes can results in
performance gains. If None, then the maximum number of modes is
equivalent to ``min(n_samples, n_features)`` (the default is None).
norm : bool
Normalize each feature (e.g. grid cell) by its temporal standard
deviation (the default is False).
weights : Union[xr.DatArray | str | None]
Weights to be applied to data (features).
Examples
--------
Expand Down Expand Up @@ -96,25 +99,44 @@ class EOF(_EOF_base):
def __init__(
self,
X: xr.DataArray,
dim: Union[str, Iterable[str]] = 'time',
n_modes : Optional[int] = None,
norm : bool = False,
dim: Union[str, Iterable[str]] = 'time'
weights : Optional[Union[xr.DataArray, str]] = None
):

if(np.logical_not(isinstance(X, xr.DataArray))):
raise ValueError('This interface is for `xarray.DataArray` only.')

self._tf = _DataArrayTransformer()
X = self._tf.fit_transform(X, dim=dim)
self._tf.fit(X, dim=dim)
if weights == 'coslat':
weights = self._get_coslat_weights(X)
X = self._tf.transform(X)
weights = self._tf.transform_weights(weights)

super().__init__(
X=X,
n_modes=n_modes,
norm=norm
norm=norm,
weights=weights
)
self._mode_idx = xr.IndexVariable('mode', range(1, self.n_modes + 1))
self._dim = dim

def _get_coslat_weights(self, X : xr.DataArray) -> xr.DataArray:
# Find dimension name of latitude
possible_lat_names = [
'latitude', 'Latitude', 'lat', 'Lat', 'LATITUDE', 'LAT'
]
idx_lat_dim = np.isin(X.dims, possible_lat_names)
lat_dim = np.array(X.dims)[idx_lat_dim][0]
# Compute coslat weights
weights = np.cos(np.deg2rad(X.coords[lat_dim]))
weights = np.sqrt(weights.where(weights > 0, 0))
# Broadcast latitude weights on other feature dimensions
sample_dims = self._tf.dims_samples
feature_grid = X.isel({k: 0 for k in sample_dims})
feature_grid = feature_grid.drop_vars(sample_dims)
return weights.broadcast_like(feature_grid)

def singular_values(self) -> xr.DataArray:
svalues = super().singular_values()
return xr.DataArray(
Expand Down

0 comments on commit 8821108

Please sign in to comment.