Commit

docs: update docstrings
nicrie committed Feb 16, 2022
1 parent 3745308 commit e02b6ec
Showing 3 changed files with 271 additions and 10 deletions.
129 changes: 126 additions & 3 deletions xeofs/models/eof.py
@@ -13,6 +13,69 @@


class EOF(_EOF_base):
'''EOF analysis of a single ``np.ndarray``.
Parameters
----------
X : np.ndarray
Data to be decomposed. ``X`` can be any N-dimensional array with the
first dimension containing the variable whose variance is to be
maximised. All remaining dimensions will be automatically reshaped to
obtain a 2D matrix. For example, given data of `n` time series of ``p1`` different
variables each measured at ``p2`` different locations, the data matrix
``X`` of dimensions ``(n x p1 x p2)`` will maximise `temporal` variance.
In contrast, if provided as ``(p2 x n x p1)``, the `spatial` variance
will be maximised.
n_modes : Optional[int]
Number of modes to compute. Computing fewer modes can result in
performance gains. If None, then the maximum number of modes is
equivalent to ``min(n_samples, n_features)`` (the default is None).
norm : bool
Normalize each feature (e.g. grid cell) by its temporal standard
deviation (the default is False).
Examples
--------
Import package and create data:
>>> import numpy as np
>>> from xeofs.models import EOF
>>> rng = np.random.default_rng(7)
>>> X = rng.standard_normal((4, 2, 3))
Initialize standardized EOF analysis and compute the first 2 modes:
>>> model = EOF(X, norm=True, n_modes=2)
>>> model.solve()
Get explained variance:
>>> model.explained_variance()
... array([4.81848833, 2.20765019])
Get EOFs:
>>> model.eofs()
... array([[[ 0.52002146, -0.00698575],
... [ 0.41059796, -0.31603563]],
...
... [[ 0.32091783, 0.47647144],
... [-0.51771611, 0.01380736]],
...
... [[-0.28840959, 0.63341346],
... [ 0.32678537, 0.52119516]]])
Get PCs:
>>> model.pcs()
... array([[ 1.76084189, -0.92030205],
... [-2.13581896, -1.49704775],
... [ 2.02091078, 0.65502667],
... [-1.64593371, 1.76232312]])
'''
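The reshaping described for the ``X`` parameter above can be sketched in plain NumPy (a minimal illustration, not xeofs internals; shapes and seed are arbitrary):

import numpy as np

rng = np.random.default_rng(7)
X = rng.standard_normal((4, 2, 3))   # (n x p1 x p2): samples first

# Trailing feature dimensions are collapsed into one, giving a
# 2D matrix of shape (n_samples, n_features) = (4, 6):
X2d = X.reshape(X.shape[0], -1)

# To maximise spatial variance instead, provide the spatial axis first,
# e.g. (p2 x n x p1), which flattens to (3, 8):
X_spatial = np.moveaxis(X, 2, 0).reshape(3, -1)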

def __init__(
self,
@@ -29,12 +92,31 @@ def __init__(

super().__init__(
X=self.X,
Y=None,
n_modes=n_modes,
norm=norm
)

def solve(self):
def solve(self) -> None:
'''
Perform the EOF analysis.
To boost performance, the standard solver is based on
the PCA implementation of scikit-learn [1]_, which chooses among
different algorithms for the decomposition depending on the size of
the data matrix.
Naive approaches based on the singular value decomposition of the
data matrix ``X (n x p)`` or the eigendecomposition of the covariance
matrix ``C (p x p)`` quickly become computationally infeasible as the
number of samples :math:`n` or features :math:`p` increases (the cost
scales as :math:`O(n^2 p)` and :math:`O(p^3)`, respectively).
References
----------
.. [1] https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.PCA.html
'''

pca = PCA(n_components=self.n_modes)
self._pcs = pca.fit_transform(self.X)
self._singular_values = pca.singular_values_
@@ -48,17 +130,58 @@ def solve(self):
self._eofs *= flip_signs
self._pcs *= flip_signs
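The ``flip_signs`` step above resolves the arbitrary sign of each mode. The exact rule is hidden in the collapsed part of the diff; a minimal sketch of one common convention (flip each mode so its largest-magnitude EOF loading is positive) could look like this:

import numpy as np

def flip_signs_sketch(eofs, pcs):
    # eofs: (n_features, n_modes), pcs: (n_samples, n_modes).
    # Flipping an EOF together with its PC leaves the reconstruction
    # pcs @ eofs.T unchanged.
    rows = np.abs(eofs).argmax(axis=0)
    signs = np.sign(eofs[rows, np.arange(eofs.shape[1])])
    signs[signs == 0] = 1   # avoid zeroing out a mode
    return eofs * signs, pcs * signs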

def singular_values(self):
def singular_values(self) -> np.ndarray:
'''Get the singular values.
The `i`-th singular value :math:`\sigma_i` is defined by
.. math::
\sigma_i = \sqrt{(n - 1) \lambda_i}
where :math:`\lambda_i` is the eigenvalue associated with mode `i`
and :math:`n` is the number of samples (scikit-learn defines the
explained variance with :math:`n - 1` in the denominator).
'''

return self._singular_values
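That relation can be checked numerically against the scikit-learn solver used by ``solve()`` (a sketch; shapes are arbitrary):

import numpy as np
from sklearn.decomposition import PCA

X = np.random.default_rng(0).standard_normal((50, 6))
pca = PCA(n_components=3).fit(X)
n = X.shape[0]

# scikit-learn defines explained_variance_ = singular_values_**2 / (n - 1)
assert np.allclose(pca.singular_values_,
                   np.sqrt((n - 1) * pca.explained_variance_))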

def explained_variance(self):
'''Get the explained variance.
The explained variance is simply given by the individual eigenvalues
of the covariance matrix.
'''

return self._explained_variance

def explained_variance_ratio(self):
'''Get the explained variance ratio.
The explained variance ratio is the fraction of total variance
explained by a given mode and is calculated as :math:`\lambda_i / \sum_{j=1}^m \lambda_j`,
where `m` is the total number of modes.
'''

return self._explained_variance_ratio
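As a quick sanity check of that ratio (a sketch; when all modes are computed, the ratios sum to one):

import numpy as np
from sklearn.decomposition import PCA

X = np.random.default_rng(1).standard_normal((50, 6))
pca = PCA().fit(X)   # all modes

ratio = pca.explained_variance_ / pca.explained_variance_.sum()
assert np.allclose(ratio, pca.explained_variance_ratio_)
assert np.isclose(ratio.sum(), 1.0)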

def eofs(self):
'''Get the EOFs.
The empirical orthogonal functions (EOFs) are equivalent to the eigenvectors
of the covariance matrix of `X`.
'''

return self._arr_tf.back_transform(self._eofs.T).T

def pcs(self):
'''Get the PCs.
The principal components (PCs), also known as PC scores, are computed
by projecting the data matrix `X` onto the eigenvectors.
'''

return self._pcs
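The projection mentioned in the ``pcs`` docstring can be made explicit with scikit-learn (a sketch; PCA centres the data internally, so the manual projection must do the same):

import numpy as np
from sklearn.decomposition import PCA

X = np.random.default_rng(2).standard_normal((40, 5))
pca = PCA(n_components=2).fit(X)

# 'Projecting X onto the eigenvectors' = multiplying the centred data
# with the EOFs (principal axes):
pcs_manual = (X - X.mean(axis=0)) @ pca.components_.T
assert np.allclose(pca.transform(X), pcs_manual)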
59 changes: 58 additions & 1 deletion xeofs/pandas/eof.py
Original file line number Diff line number Diff line change
@@ -8,6 +8,64 @@


class EOF(models.eof.EOF):
'''EOF analysis of a single ``pd.DataFrame``.
Parameters
----------
X : pd.DataFrame
Data to be decomposed. The first dimension contains the variable whose
variance is to be maximised.
n_modes : Optional[int]
Number of modes to compute. Computing fewer modes can result in
performance gains. If None, then the maximum number of modes is
equivalent to ``min(n_samples, n_features)`` (the default is None).
norm : bool
Normalize each feature (e.g. grid cell) by its temporal standard
deviation (the default is False).
Examples
--------
Import package and create data:
>>> import numpy as np
>>> import pandas as pd
>>> from xeofs.pandas import EOF
>>> rng = np.random.default_rng(7)
>>> X = rng.standard_normal((4, 3))
>>> df = pd.DataFrame(X)
Initialize standardized EOF analysis and compute the first 2 modes:
>>> model = EOF(df, norm=True, n_modes=2)
>>> model.solve()
Get explained variance:
>>> model.explained_variance()
... explained_variance
... mode
... 1 2.562701
... 2 1.167054
Get EOFs:
>>> model.eofs()
... mode 1 2
... 0 0.626041 -0.428431
... 1 0.677121 -0.115737
... 2 0.386755 0.896132
Get PCs:
>>> model.pcs()
... mode 1 2
... 0 0.495840 -0.221963
... 1 -2.254508 -0.470420
... 2 1.516900 -0.876695
... 3 0.241768 1.569078
'''

def __init__(
self,
@@ -24,7 +82,6 @@ def __init__(

super().__init__(
X=X,
Y=None,
n_modes=n_modes,
norm=norm
)
93 changes: 87 additions & 6 deletions xeofs/xarray/eof.py
@@ -8,6 +8,88 @@


class EOF(models.eof.EOF):
'''EOF analysis of a single ``xr.DataArray``.
Parameters
----------
X : xr.DataArray
Data to be decomposed.
n_modes : Optional[int]
Number of modes to compute. Computing fewer modes can result in
performance gains. If None, then the maximum number of modes is
equivalent to ``min(n_samples, n_features)`` (the default is None).
norm : bool
Normalize each feature (e.g. grid cell) by its temporal standard
deviation (the default is False).
dim : str
Define the dimension along which variance is to be maximised.
For most applications in climate science, temporal variance is
maximised (also known as S-mode EOF analysis), i.e. the time dimension
should be chosen (the default is ``time``).
Examples
--------
Import package and create data:
>>> import xarray as xr
>>> from xeofs.xarray import EOF
>>> da = xr.tutorial.load_dataset('air_temperature')['air']
>>> da = da.isel(lon=slice(0, 3), lat=slice(0, 2))
Initialize standardized EOF analysis and compute the first 2 modes:
>>> model = EOF(da, norm=True, n_modes=2)
>>> model.solve()
Get explained variance:
>>> model.explained_variance()
... <xarray.DataArray 'explained_variance' (mode: 2)>
... array([5.8630486 , 0.12335653], dtype=float32)
... Coordinates:
... mode (mode) int64 1 2
Get EOFs:
>>> model.eofs()
... <xarray.DataArray 'EOFs' (lon: 3, lat: 2, mode: 2)>
... array([[[ 0.4083837 , -0.39021498],
... [ 0.40758175, 0.42474997]],
...
... [[ 0.40875185, -0.40969774],
... [ 0.40863484, 0.41475943]],
...
... [[ 0.40776297, -0.4237887 ],
... [ 0.40837321, 0.38450614]]])
... Coordinates:
... lat (lat) float32 75.0 72.5
... lon (lon) float32 200.0 202.5 205.0
... mode (mode) int64 1 2
Get PCs:
>>> model.pcs()
... <xarray.DataArray 'PCs' (time: 2920, mode: 2)>
... array([[-3.782707 , -0.07754549],
... [-3.7966802 , -0.13775176],
... [-3.7969239 , -0.05770111],
... ...,
... [-3.2584608 , 0.3592216 ],
... [-3.031799 , 0.2055658 ],
... [-3.0840495 , 0.25031802]], dtype=float32)
... Coordinates:
... time (time) datetime64[ns] 2013-01-01 ... 2014-12-31T18:00:00
... mode (mode) int64 1 2
'''
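The ``dim`` argument above sets which dimension plays the role of the sample dimension; a brief usage sketch (assuming the ``air_temperature`` tutorial data from the example; the non-default ``dim`` value is hypothetical):

import xarray as xr
from xeofs.xarray import EOF

da = xr.tutorial.load_dataset('air_temperature')['air']

# S-mode (the default): samples along 'time', EOFs span the spatial grid
model = EOF(da, norm=True, n_modes=2, dim='time')
model.solve()

# Passing another dimension name present in the DataArray would instead
# maximise variance along that axis (e.g. T-mode-style analysis).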

def __init__(
self,
@@ -25,14 +107,13 @@

super().__init__(
X=X,
Y=None,
n_modes=n_modes,
norm=norm
)
self._mode_idx = xr.IndexVariable('mode', range(1, self.n_modes + 1))
self._dim = dim

def singular_values(self):
def singular_values(self) -> xr.DataArray:
svalues = super().singular_values()
return xr.DataArray(
svalues,
@@ -41,7 +122,7 @@ def singular_values(self):
name='singular_values'
)

def explained_variance(self):
def explained_variance(self) -> xr.DataArray:
expvar = super().explained_variance()
return xr.DataArray(
expvar,
@@ -50,7 +131,7 @@ def explained_variance(self):
name='explained_variance'
)

def explained_variance_ratio(self):
def explained_variance_ratio(self) -> xr.DataArray:
expvar = super().explained_variance_ratio()
return xr.DataArray(
expvar,
@@ -59,15 +140,15 @@ def explained_variance_ratio(self):
name='explained_variance_ratio'
)

def eofs(self):
def eofs(self) -> xr.DataArray:
eofs = self._eofs
eofs = self._da_tf.back_transform(eofs.T).T
eofs = eofs.rename({self._dim : 'mode'})
eofs = eofs.assign_coords({'mode' : self._mode_idx})
eofs.name = 'EOFs'
return eofs

def pcs(self):
def pcs(self) -> xr.DataArray:
pcs = super().pcs()
coords = {
self._dim : self._da_tf.coords_in[self._dim],
