diff --git a/README.md b/README.md index 9c1015c..0244877 100644 --- a/README.md +++ b/README.md @@ -136,6 +136,7 @@ For questions or support, please open a [Github issue](https://github.com/xarray - MCA: Python package [xMCA](https://github.com/Yefee/xMCA) by Yefee - CCA: Python package [CCA-Zoo](https://github.com/jameschapman19/cca_zoo) by James Chapman - ROCK-PCA: Matlab implementation by [Diego Bueso](https://github.com/DiegoBueso/ROCK-PCA) +- Sparse PCA: Based on [Ristretto](https://github.com/erichson/ristretto) library by Benjamin Erichson ## How to cite? diff --git a/docs/_autosummary/xeofs.models.SparsePCA.rst b/docs/_autosummary/xeofs.models.SparsePCA.rst new file mode 100644 index 0000000..068ccd8 --- /dev/null +++ b/docs/_autosummary/xeofs.models.SparsePCA.rst @@ -0,0 +1,40 @@ +xeofs.models.SparsePCA +====================== + +.. currentmodule:: xeofs.models + +.. autoclass:: SparsePCA + :members: + :show-inheritance: + :inherited-members: + + + .. automethod:: __init__ + + + .. rubric:: Methods + + .. autosummary:: + + ~SparsePCA.__init__ + ~SparsePCA.components + ~SparsePCA.compute + ~SparsePCA.deserialize + ~SparsePCA.explained_variance + ~SparsePCA.explained_variance_ratio + ~SparsePCA.fit + ~SparsePCA.fit_transform + ~SparsePCA.get_params + ~SparsePCA.get_serialization_attrs + ~SparsePCA.inverse_transform + ~SparsePCA.load + ~SparsePCA.save + ~SparsePCA.scores + ~SparsePCA.serialize + ~SparsePCA.transform + + + + + + \ No newline at end of file diff --git a/docs/api_1_intra.rst b/docs/api_1_intra.rst index f78f553..a8c00c3 100644 --- a/docs/api_1_intra.rst +++ b/docs/api_1_intra.rst @@ -10,5 +10,6 @@ Single-Set Analysis xeofs.models.ComplexEOF xeofs.models.ComplexEOFRotator xeofs.models.ExtendedEOF + xeofs.models.SparsePCA xeofs.models.OPA xeofs.models.GWPCA diff --git a/docs/auto_examples/1single/images/sphx_glr_plot_eof-smode_001.png b/docs/auto_examples/1single/images/sphx_glr_plot_eof-smode_001.png index 8edbe22..49615fd 100644 Binary files a/docs/auto_examples/1single/images/sphx_glr_plot_eof-smode_001.png and b/docs/auto_examples/1single/images/sphx_glr_plot_eof-smode_001.png differ diff --git a/docs/auto_examples/1single/images/thumb/sphx_glr_plot_eof-smode_thumb.png b/docs/auto_examples/1single/images/thumb/sphx_glr_plot_eof-smode_thumb.png index 8f5a268..081c54c 100644 Binary files a/docs/auto_examples/1single/images/thumb/sphx_glr_plot_eof-smode_thumb.png and b/docs/auto_examples/1single/images/thumb/sphx_glr_plot_eof-smode_thumb.png differ diff --git a/docs/auto_examples/1single/index.rst b/docs/auto_examples/1single/index.rst index 1e33d6f..246366b 100644 --- a/docs/auto_examples/1single/index.rst +++ b/docs/auto_examples/1single/index.rst @@ -65,7 +65,7 @@ .. raw:: html -
+    <div class="sphx-glr-thumbcontainer" tooltip="...">

.. only:: html

@@ -76,7 +76,7 @@

 .. raw:: html

-      <div class="sphx-glr-thumbnail-title">EOF analysis (S-mode)</div>
+      <div class="sphx-glr-thumbnail-title">Sparse PCA</div>
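A minimal sketch of the workflow the renamed example exercises, assembled from the lines added in this diff (it assumes the ``xr.tutorial`` ERSSTv5 dataset can be downloaded; ``n_modes=4`` and ``alpha=1e-5`` are the values the example itself uses, with ``beta`` left at its default):

.. code-block:: python

    import xarray as xr

    from xeofs.models import SparsePCA

    # Monthly SSTs, restricted to 1990-2017 as in the example
    sst = xr.tutorial.open_dataset("ersstv5")["sst"]
    sst = sst.sel(time=slice("1990", "2017"))

    # Sparse PCA with the example's settings
    model = SparsePCA(n_modes=4, alpha=1e-5)
    model.fit(sst, dim="time")

    expvar_ratio = model.explained_variance_ratio()  # variance fraction per mode
    components = model.components()                  # sparse spatial patterns
    scores = model.scores()                          # corresponding temporal amplitudes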
diff --git a/docs/auto_examples/1single/plot_eof-smode.ipynb b/docs/auto_examples/1single/plot_eof-smode.ipynb index dbc977c..97441b1 100644 --- a/docs/auto_examples/1single/plot_eof-smode.ipynb +++ b/docs/auto_examples/1single/plot_eof-smode.ipynb @@ -5,9 +5,14 @@ "metadata": {}, "source": [ "\n", - "# EOF analysis (S-mode)\n", + "# Sparse PCA\n", "\n", - "EOF analysis in S-mode maximises the temporal variance.\n" + "This example demonstrates the application of sparse PCA [1]_ to sea surface temperature data. Sparse PCA is an alternative to rotated PCA, where the components are sparse, often providing a more interpretable solution.\n", + "\n", + "We replicate the analysis from the original paper [1]_, which identifies the ENSO (El Niño-Southern Oscillation) as the fourth mode, representing about 1% of the total variance. The original study focused on weekly sea surface temperatures from satellite data, whereas we use monthly data from ERSSTv5. Consequently, our results may not match exactly, but they should be quite similar.\n", + "\n", + "## References\n", + ".. [1] Erichson, N. B. et al. Sparse Principal Component Analysis via Variable Projection. SIAM J. Appl. Math. 80, 977-1002 (2020).\n" ] }, { @@ -17,12 +22,20 @@ "outputs": [], "source": [ "# Load packages and data:\n", - "import xarray as xr\n", "import matplotlib.pyplot as plt\n", - "from matplotlib.gridspec import GridSpec\n", + "import xarray as xr\n", "from cartopy.crs import EqualEarth, PlateCarree\n", + "from matplotlib.gridspec import GridSpec\n", "\n", - "from xeofs.models import EOF" + "from xeofs.models import SparsePCA" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We use sea surface temperature data from 1990 to 2017, consistent with the original paper.\n", + "\n" ] }, { @@ -31,14 +44,15 @@ "metadata": {}, "outputs": [], "source": [ - "sst = xr.tutorial.open_dataset(\"ersstv5\")[\"sst\"]" + "sst = xr.tutorial.open_dataset(\"ersstv5\")[\"sst\"]\n", + "sst = sst.sel(time=slice(\"1990\", \"2017\"))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Perform the actual analysis\n", + "We perform sparse PCA using the `alpha` and `beta` parameters, which define the sparsity imposed by the elastic net (refer to Table 1 in the paper). In our analysis, we set `alpha` to 1e-5, as specified by the authors. Although the authors do not specify a value for `beta`, it appears that the results are not highly sensitive to this parameter. 
Therefore, we use the default `beta` value of 1e-4.\n", "\n" ] }, @@ -48,7 +62,7 @@ "metadata": {}, "outputs": [], "source": [ - "model = EOF(n_modes=5, use_coslat=True)\n", + "model = SparsePCA(n_modes=4, alpha=1e-5)\n", "model.fit(sst, dim=\"time\")\n", "expvar = model.explained_variance()\n", "expvar_ratio = model.explained_variance_ratio()\n", @@ -60,7 +74,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Explained variance fraction\n", + "The explained variance fraction confirms that the fourth mode explains about 1% of the total variance, which is consistent with the original paper.\n", "\n" ] }, @@ -78,7 +92,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Create figure showing the first two modes\n", + "Examining the first four modes, we clearly identify ENSO as the fourth mode.\n", "\n" ] }, @@ -91,10 +105,10 @@ "proj = EqualEarth(central_longitude=180)\n", "kwargs = {\"cmap\": \"RdBu\", \"vmin\": -0.05, \"vmax\": 0.05, \"transform\": PlateCarree()}\n", "\n", - "fig = plt.figure(figsize=(10, 8))\n", - "gs = GridSpec(3, 2, width_ratios=[1, 2])\n", - "ax0 = [fig.add_subplot(gs[i, 0]) for i in range(3)]\n", - "ax1 = [fig.add_subplot(gs[i, 1], projection=proj) for i in range(3)]\n", + "fig = plt.figure(figsize=(10, 12))\n", + "gs = GridSpec(4, 2, width_ratios=[1, 2])\n", + "ax0 = [fig.add_subplot(gs[i, 0]) for i in range(4)]\n", + "ax1 = [fig.add_subplot(gs[i, 1], projection=proj) for i in range(4)]\n", "\n", "for i, (a0, a1) in enumerate(zip(ax0, ax1)):\n", " scores.sel(mode=i + 1).plot(ax=a0)\n", @@ -104,7 +118,7 @@ " a0.set_xlabel(\"\")\n", "\n", "plt.tight_layout()\n", - "plt.savefig(\"eof-smode.jpg\")" + "plt.savefig(\"sparse_pca.jpg\")" ] } ], @@ -124,7 +138,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.8" + "version": "3.11.4" } }, "nbformat": 4, diff --git a/docs/auto_examples/1single/plot_eof-smode.py b/docs/auto_examples/1single/plot_eof-smode.py index f9cd5f3..a81d00c 100644 --- a/docs/auto_examples/1single/plot_eof-smode.py +++ b/docs/auto_examples/1single/plot_eof-smode.py @@ -1,26 +1,35 @@ """ -EOF analysis (S-mode) +Sparse PCA ======================== -EOF analysis in S-mode maximises the temporal variance. +This example demonstrates the application of sparse PCA [1]_ to sea surface temperature data. Sparse PCA is an alternative to rotated PCA, where the components are sparse, often providing a more interpretable solution. + +We replicate the analysis from the original paper [1]_, which identifies the ENSO (El Niño-Southern Oscillation) as the fourth mode, representing about 1% of the total variance. The original study focused on weekly sea surface temperatures from satellite data, whereas we use monthly data from ERSSTv5. Consequently, our results may not match exactly, but they should be quite similar. + +References +---------- +.. [1] Erichson, N. B. et al. Sparse Principal Component Analysis via Variable Projection. SIAM J. Appl. Math. 80, 977-1002 (2020). + """ # Load packages and data: -import xarray as xr import matplotlib.pyplot as plt -from matplotlib.gridspec import GridSpec +import xarray as xr from cartopy.crs import EqualEarth, PlateCarree +from matplotlib.gridspec import GridSpec -from xeofs.models import EOF +from xeofs.models import SparsePCA # %% +# We use sea surface temperature data from 1990 to 2017, consistent with the original paper. 
sst = xr.tutorial.open_dataset("ersstv5")["sst"] +sst = sst.sel(time=slice("1990", "2017")) # %% -# Perform the actual analysis +# We perform sparse PCA using the `alpha` and `beta` parameters, which define the sparsity imposed by the elastic net (refer to Table 1 in the paper). In our analysis, we set `alpha` to 1e-5, as specified by the authors. Although the authors do not specify a value for `beta`, it appears that the results are not highly sensitive to this parameter. Therefore, we use the default `beta` value of 1e-4. -model = EOF(n_modes=5, use_coslat=True) +model = SparsePCA(n_modes=4, alpha=1e-5) model.fit(sst, dim="time") expvar = model.explained_variance() expvar_ratio = model.explained_variance_ratio() @@ -28,21 +37,21 @@ scores = model.scores() # %% -# Explained variance fraction +# The explained variance fraction confirms that the fourth mode explains about 1% of the total variance, which is consistent with the original paper. print("Explained variance: ", expvar.round(0).values) print("Relative: ", (expvar_ratio * 100).round(1).values) # %% -# Create figure showing the first two modes +# Examining the first four modes, we clearly identify ENSO as the fourth mode. proj = EqualEarth(central_longitude=180) kwargs = {"cmap": "RdBu", "vmin": -0.05, "vmax": 0.05, "transform": PlateCarree()} -fig = plt.figure(figsize=(10, 8)) -gs = GridSpec(3, 2, width_ratios=[1, 2]) -ax0 = [fig.add_subplot(gs[i, 0]) for i in range(3)] -ax1 = [fig.add_subplot(gs[i, 1], projection=proj) for i in range(3)] +fig = plt.figure(figsize=(10, 12)) +gs = GridSpec(4, 2, width_ratios=[1, 2]) +ax0 = [fig.add_subplot(gs[i, 0]) for i in range(4)] +ax1 = [fig.add_subplot(gs[i, 1], projection=proj) for i in range(4)] for i, (a0, a1) in enumerate(zip(ax0, ax1)): scores.sel(mode=i + 1).plot(ax=a0) @@ -52,4 +61,6 @@ a0.set_xlabel("") plt.tight_layout() -plt.savefig("eof-smode.jpg") +plt.savefig("sparse_pca.jpg") + +# %% diff --git a/docs/auto_examples/1single/plot_eof-smode.py.md5 b/docs/auto_examples/1single/plot_eof-smode.py.md5 index 62a7492..5957648 100644 --- a/docs/auto_examples/1single/plot_eof-smode.py.md5 +++ b/docs/auto_examples/1single/plot_eof-smode.py.md5 @@ -1 +1 @@ -9dcc843908baf836161fbc9c490b6136 \ No newline at end of file +bb0b6c390528787d00735dd586afb457 \ No newline at end of file diff --git a/docs/auto_examples/1single/plot_eof-smode.rst b/docs/auto_examples/1single/plot_eof-smode.rst index cb87820..1f930ac 100644 --- a/docs/auto_examples/1single/plot_eof-smode.rst +++ b/docs/auto_examples/1single/plot_eof-smode.rst @@ -18,37 +18,48 @@ .. _sphx_glr_auto_examples_1single_plot_eof-smode.py: -EOF analysis (S-mode) +Sparse PCA ======================== -EOF analysis in S-mode maximises the temporal variance. +This example demonstrates the application of sparse PCA [1]_ to sea surface temperature data. Sparse PCA is an alternative to rotated PCA, where the components are sparse, often providing a more interpretable solution. -.. GENERATED FROM PYTHON SOURCE LINES 7-16 +We replicate the analysis from the original paper [1]_, which identifies the ENSO (El Niño-Southern Oscillation) as the fourth mode, representing about 1% of the total variance. The original study focused on weekly sea surface temperatures from satellite data, whereas we use monthly data from ERSSTv5. Consequently, our results may not match exactly, but they should be quite similar. -.. code-block:: Python +References +---------- +.. [1] Erichson, N. B. et al. Sparse Principal Component Analysis via Variable Projection. SIAM J. Appl. 
Math. 80, 977-1002 (2020). + +.. GENERATED FROM PYTHON SOURCE LINES 14-23 + +.. code-block:: default # Load packages and data: - import xarray as xr import matplotlib.pyplot as plt - from matplotlib.gridspec import GridSpec + import xarray as xr from cartopy.crs import EqualEarth, PlateCarree + from matplotlib.gridspec import GridSpec + + from xeofs.models import SparsePCA + - from xeofs.models import EOF +.. GENERATED FROM PYTHON SOURCE LINES 24-25 +We use sea surface temperature data from 1990 to 2017, consistent with the original paper. -.. GENERATED FROM PYTHON SOURCE LINES 17-20 +.. GENERATED FROM PYTHON SOURCE LINES 25-29 -.. code-block:: Python +.. code-block:: default sst = xr.tutorial.open_dataset("ersstv5")["sst"] + sst = sst.sel(time=slice("1990", "2017")) @@ -57,16 +68,16 @@ EOF analysis in S-mode maximises the temporal variance. -.. GENERATED FROM PYTHON SOURCE LINES 21-22 +.. GENERATED FROM PYTHON SOURCE LINES 30-31 -Perform the actual analysis +We perform sparse PCA using the `alpha` and `beta` parameters, which define the sparsity imposed by the elastic net (refer to Table 1 in the paper). In our analysis, we set `alpha` to 1e-5, as specified by the authors. Although the authors do not specify a value for `beta`, it appears that the results are not highly sensitive to this parameter. Therefore, we use the default `beta` value of 1e-4. -.. GENERATED FROM PYTHON SOURCE LINES 22-30 +.. GENERATED FROM PYTHON SOURCE LINES 31-39 -.. code-block:: Python +.. code-block:: default - model = EOF(n_modes=5, use_coslat=True) + model = SparsePCA(n_modes=4, alpha=1e-5) model.fit(sst, dim="time") expvar = model.explained_variance() expvar_ratio = model.explained_variance_ratio() @@ -80,13 +91,13 @@ Perform the actual analysis -.. GENERATED FROM PYTHON SOURCE LINES 31-32 +.. GENERATED FROM PYTHON SOURCE LINES 40-41 -Explained variance fraction +The explained variance fraction confirms that the fourth mode explains about 1% of the total variance, which is consistent with the original paper. -.. GENERATED FROM PYTHON SOURCE LINES 32-36 +.. GENERATED FROM PYTHON SOURCE LINES 41-45 -.. code-block:: Python +.. code-block:: default print("Explained variance: ", expvar.round(0).values) @@ -100,28 +111,28 @@ Explained variance fraction .. code-block:: none - Explained variance: [24398. 1066. 676. 407. 303.] - Relative: [85.5 3.7 2.4 1.4 1.1] + Explained variance: [34060. 1252. 963. 405.] + Relative: [86. 3.2 2.4 1. ] -.. GENERATED FROM PYTHON SOURCE LINES 37-38 +.. GENERATED FROM PYTHON SOURCE LINES 46-47 -Create figure showing the first two modes +Examining the first four modes, we clearly identify ENSO as the fourth mode. -.. GENERATED FROM PYTHON SOURCE LINES 38-56 +.. GENERATED FROM PYTHON SOURCE LINES 47-66 -.. code-block:: Python +.. 
code-block:: default proj = EqualEarth(central_longitude=180) kwargs = {"cmap": "RdBu", "vmin": -0.05, "vmax": 0.05, "transform": PlateCarree()} - fig = plt.figure(figsize=(10, 8)) - gs = GridSpec(3, 2, width_ratios=[1, 2]) - ax0 = [fig.add_subplot(gs[i, 0]) for i in range(3)] - ax1 = [fig.add_subplot(gs[i, 1], projection=proj) for i in range(3)] + fig = plt.figure(figsize=(10, 12)) + gs = GridSpec(4, 2, width_ratios=[1, 2]) + ax0 = [fig.add_subplot(gs[i, 0]) for i in range(4)] + ax1 = [fig.add_subplot(gs[i, 1], projection=proj) for i in range(4)] for i, (a0, a1) in enumerate(zip(ax0, ax1)): scores.sel(mode=i + 1).plot(ax=a0) @@ -131,12 +142,13 @@ Create figure showing the first two modes a0.set_xlabel("") plt.tight_layout() - plt.savefig("eof-smode.jpg") + plt.savefig("sparse_pca.jpg") + .. image-sg:: /auto_examples/1single/images/sphx_glr_plot_eof-smode_001.png - :alt: mode = 1, mode = 2, mode = 3, mode = 1, mode = 2, mode = 3 + :alt: mode = 1, mode = 2, mode = 3, mode = 4, mode = 1, mode = 2, mode = 3, mode = 4 :srcset: /auto_examples/1single/images/sphx_glr_plot_eof-smode_001.png :class: sphx-glr-single-img @@ -147,7 +159,7 @@ Create figure showing the first two modes .. rst-class:: sphx-glr-timing - **Total running time of the script:** (0 minutes 2.207 seconds) + **Total running time of the script:** (0 minutes 7.992 seconds) .. _sphx_glr_download_auto_examples_1single_plot_eof-smode.py: @@ -156,14 +168,17 @@ Create figure showing the first two modes .. container:: sphx-glr-footer sphx-glr-footer-example - .. container:: sphx-glr-download sphx-glr-download-jupyter - :download:`Download Jupyter notebook: plot_eof-smode.ipynb ` + .. container:: sphx-glr-download sphx-glr-download-python :download:`Download Python source code: plot_eof-smode.py ` + .. container:: sphx-glr-download sphx-glr-download-jupyter + + :download:`Download Jupyter notebook: plot_eof-smode.ipynb ` + .. only:: html diff --git a/docs/auto_examples/1single/plot_eof-smode_codeobj.pickle b/docs/auto_examples/1single/plot_eof-smode_codeobj.pickle index bd0de31..78dfade 100644 Binary files a/docs/auto_examples/1single/plot_eof-smode_codeobj.pickle and b/docs/auto_examples/1single/plot_eof-smode_codeobj.pickle differ diff --git a/docs/auto_examples/1single/sg_execution_times.rst b/docs/auto_examples/1single/sg_execution_times.rst index 757bb30..1693d43 100644 --- a/docs/auto_examples/1single/sg_execution_times.rst +++ b/docs/auto_examples/1single/sg_execution_times.rst @@ -6,59 +6,26 @@ Computation times ================= -**00:39.458** total execution time for 10 files **from auto_examples/1single**: - -.. container:: - - .. raw:: html - - - - - - - - .. 
list-table:: - :header-rows: 1 - :class: table table-striped sg-datatable - - * - Example - - Time - - Mem (MB) - * - :ref:`sphx_glr_auto_examples_1single_plot_gwpca.py` (``plot_gwpca.py``) - - 00:34.593 - - 0.0 - * - :ref:`sphx_glr_auto_examples_1single_plot_weighted-eof.py` (``plot_weighted-eof.py``) - - 00:01.928 - - 0.0 - * - :ref:`sphx_glr_auto_examples_1single_plot_mreof.py` (``plot_mreof.py``) - - 00:01.727 - - 0.0 - * - :ref:`sphx_glr_auto_examples_1single_plot_multivariate-eof.py` (``plot_multivariate-eof.py``) - - 00:01.211 - - 0.0 - * - :ref:`sphx_glr_auto_examples_1single_plot_complex_eof.py` (``plot_complex_eof.py``) - - 00:00.000 - - 0.0 - * - :ref:`sphx_glr_auto_examples_1single_plot_eeof.py` (``plot_eeof.py``) - - 00:00.000 - - 0.0 - * - :ref:`sphx_glr_auto_examples_1single_plot_eeof_trend.py` (``plot_eeof_trend.py``) - - 00:00.000 - - 0.0 - * - :ref:`sphx_glr_auto_examples_1single_plot_eof-smode.py` (``plot_eof-smode.py``) - - 00:00.000 - - 0.0 - * - :ref:`sphx_glr_auto_examples_1single_plot_eof-tmode.py` (``plot_eof-tmode.py``) - - 00:00.000 - - 0.0 - * - :ref:`sphx_glr_auto_examples_1single_plot_rotated_eof.py` (``plot_rotated_eof.py``) - - 00:00.000 - - 0.0 +**00:07.992** total execution time for **auto_examples_1single** files: + ++-----------------------------------------------------------------------------------------------+-----------+--------+ +| :ref:`sphx_glr_auto_examples_1single_plot_eof-smode.py` (``plot_eof-smode.py``) | 00:07.992 | 0.0 MB | ++-----------------------------------------------------------------------------------------------+-----------+--------+ +| :ref:`sphx_glr_auto_examples_1single_plot_complex_eof.py` (``plot_complex_eof.py``) | 00:00.000 | 0.0 MB | ++-----------------------------------------------------------------------------------------------+-----------+--------+ +| :ref:`sphx_glr_auto_examples_1single_plot_eeof.py` (``plot_eeof.py``) | 00:00.000 | 0.0 MB | ++-----------------------------------------------------------------------------------------------+-----------+--------+ +| :ref:`sphx_glr_auto_examples_1single_plot_eeof_trend.py` (``plot_eeof_trend.py``) | 00:00.000 | 0.0 MB | ++-----------------------------------------------------------------------------------------------+-----------+--------+ +| :ref:`sphx_glr_auto_examples_1single_plot_eof-tmode.py` (``plot_eof-tmode.py``) | 00:00.000 | 0.0 MB | ++-----------------------------------------------------------------------------------------------+-----------+--------+ +| :ref:`sphx_glr_auto_examples_1single_plot_gwpca.py` (``plot_gwpca.py``) | 00:00.000 | 0.0 MB | ++-----------------------------------------------------------------------------------------------+-----------+--------+ +| :ref:`sphx_glr_auto_examples_1single_plot_mreof.py` (``plot_mreof.py``) | 00:00.000 | 0.0 MB | ++-----------------------------------------------------------------------------------------------+-----------+--------+ +| :ref:`sphx_glr_auto_examples_1single_plot_multivariate-eof.py` (``plot_multivariate-eof.py``) | 00:00.000 | 0.0 MB | ++-----------------------------------------------------------------------------------------------+-----------+--------+ +| :ref:`sphx_glr_auto_examples_1single_plot_rotated_eof.py` (``plot_rotated_eof.py``) | 00:00.000 | 0.0 MB | ++-----------------------------------------------------------------------------------------------+-----------+--------+ +| :ref:`sphx_glr_auto_examples_1single_plot_weighted-eof.py` (``plot_weighted-eof.py``) | 00:00.000 | 0.0 MB | 
++-----------------------------------------------------------------------------------------------+-----------+--------+ diff --git a/docs/auto_examples/auto_examples_jupyter.zip b/docs/auto_examples/auto_examples_jupyter.zip index ad387a5..1bfbde0 100644 Binary files a/docs/auto_examples/auto_examples_jupyter.zip and b/docs/auto_examples/auto_examples_jupyter.zip differ diff --git a/docs/auto_examples/auto_examples_python.zip b/docs/auto_examples/auto_examples_python.zip index a411b1e..d4acea5 100644 Binary files a/docs/auto_examples/auto_examples_python.zip and b/docs/auto_examples/auto_examples_python.zip differ diff --git a/docs/auto_examples/index.rst b/docs/auto_examples/index.rst index 67544df..42a470f 100644 --- a/docs/auto_examples/index.rst +++ b/docs/auto_examples/index.rst @@ -79,7 +79,7 @@ Here you can find some examples of how to use the library. .. raw:: html -
+    <div class="sphx-glr-thumbcontainer" tooltip="...">

.. only:: html

@@ -90,7 +90,7 @@

 .. raw:: html

-      <div class="sphx-glr-thumbnail-title">EOF analysis (S-mode)</div>
+      <div class="sphx-glr-thumbnail-title">Sparse PCA</div>
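The thumbnail above links to the same Sparse PCA example; the solver behind it, added further down in this diff as ``xeofs/models/_np_classes/_sparse_pca.py``, induces sparsity with proximal thresholding operators. A standalone NumPy sketch of the two regularizers, mirroring the ``soft_l1`` (LASSO) and ``soft_l0`` formulas defined in that file (the copy avoids the in-place mutation used there; the sample array ``b`` is illustrative only):

.. code-block:: python

    import numpy as np

    def soft_l1(arr, thresh):
        # L1 proximal step: shrink each entry toward zero by `thresh`
        return np.sign(arr) * np.maximum(np.abs(arr) - thresh, 0)

    def soft_l0(arr, thresh):
        # L0 proximal step: zero out entries with arr**2 < 2 * thresh
        out = arr.copy()
        out[out**2 < 2 * thresh] = 0
        return out

    b = np.array([-0.3, 0.05, 0.0, 0.8])
    print(soft_l1(b, 0.1))   # approx. [-0.2, 0.0, 0.0, 0.7]
    print(soft_l0(b, 0.02))  # [-0.3, 0.0, 0.0, 0.8]

In the variable-projection loop of ``compute_spca``, the chosen operator is applied to the gradient-updated matrix ``B`` with threshold ``kappa = nu * alpha``, which is what makes the recovered components sparse.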
diff --git a/examples/1single/plot_eof-smode.py b/examples/1single/plot_eof-smode.py index f9cd5f3..a81d00c 100644 --- a/examples/1single/plot_eof-smode.py +++ b/examples/1single/plot_eof-smode.py @@ -1,26 +1,35 @@ """ -EOF analysis (S-mode) +Sparse PCA ======================== -EOF analysis in S-mode maximises the temporal variance. +This example demonstrates the application of sparse PCA [1]_ to sea surface temperature data. Sparse PCA is an alternative to rotated PCA, where the components are sparse, often providing a more interpretable solution. + +We replicate the analysis from the original paper [1]_, which identifies the ENSO (El Niño-Southern Oscillation) as the fourth mode, representing about 1% of the total variance. The original study focused on weekly sea surface temperatures from satellite data, whereas we use monthly data from ERSSTv5. Consequently, our results may not match exactly, but they should be quite similar. + +References +---------- +.. [1] Erichson, N. B. et al. Sparse Principal Component Analysis via Variable Projection. SIAM J. Appl. Math. 80, 977-1002 (2020). + """ # Load packages and data: -import xarray as xr import matplotlib.pyplot as plt -from matplotlib.gridspec import GridSpec +import xarray as xr from cartopy.crs import EqualEarth, PlateCarree +from matplotlib.gridspec import GridSpec -from xeofs.models import EOF +from xeofs.models import SparsePCA # %% +# We use sea surface temperature data from 1990 to 2017, consistent with the original paper. sst = xr.tutorial.open_dataset("ersstv5")["sst"] +sst = sst.sel(time=slice("1990", "2017")) # %% -# Perform the actual analysis +# We perform sparse PCA using the `alpha` and `beta` parameters, which define the sparsity imposed by the elastic net (refer to Table 1 in the paper). In our analysis, we set `alpha` to 1e-5, as specified by the authors. Although the authors do not specify a value for `beta`, it appears that the results are not highly sensitive to this parameter. Therefore, we use the default `beta` value of 1e-4. -model = EOF(n_modes=5, use_coslat=True) +model = SparsePCA(n_modes=4, alpha=1e-5) model.fit(sst, dim="time") expvar = model.explained_variance() expvar_ratio = model.explained_variance_ratio() @@ -28,21 +37,21 @@ scores = model.scores() # %% -# Explained variance fraction +# The explained variance fraction confirms that the fourth mode explains about 1% of the total variance, which is consistent with the original paper. print("Explained variance: ", expvar.round(0).values) print("Relative: ", (expvar_ratio * 100).round(1).values) # %% -# Create figure showing the first two modes +# Examining the first four modes, we clearly identify ENSO as the fourth mode. 
proj = EqualEarth(central_longitude=180) kwargs = {"cmap": "RdBu", "vmin": -0.05, "vmax": 0.05, "transform": PlateCarree()} -fig = plt.figure(figsize=(10, 8)) -gs = GridSpec(3, 2, width_ratios=[1, 2]) -ax0 = [fig.add_subplot(gs[i, 0]) for i in range(3)] -ax1 = [fig.add_subplot(gs[i, 1], projection=proj) for i in range(3)] +fig = plt.figure(figsize=(10, 12)) +gs = GridSpec(4, 2, width_ratios=[1, 2]) +ax0 = [fig.add_subplot(gs[i, 0]) for i in range(4)] +ax1 = [fig.add_subplot(gs[i, 1], projection=proj) for i in range(4)] for i, (a0, a1) in enumerate(zip(ax0, ax1)): scores.sel(mode=i + 1).plot(ax=a0) @@ -52,4 +61,6 @@ a0.set_xlabel("") plt.tight_layout() -plt.savefig("eof-smode.jpg") +plt.savefig("sparse_pca.jpg") + +# %% diff --git a/examples/1single/sparse_pca.jpg b/examples/1single/sparse_pca.jpg new file mode 100644 index 0000000..33df23c Binary files /dev/null and b/examples/1single/sparse_pca.jpg differ diff --git a/tests/models/test_sparse_pca.py b/tests/models/test_sparse_pca.py new file mode 100644 index 0000000..ffc594a --- /dev/null +++ b/tests/models/test_sparse_pca.py @@ -0,0 +1,569 @@ +import numpy as np +import pytest +import xarray as xr + +from xeofs.models import SparsePCA + + +def test_init(): + """Tests the initialization of the SparsePCA class""" + spca = SparsePCA(n_modes=5, standardize=True, use_coslat=True) + + # Assert preprocessor has been initialized + assert hasattr(spca, "_params") + assert hasattr(spca, "preprocessor") + + +@pytest.mark.parametrize( + "dim", + [ + (("time",)), + (("lat", "lon")), + (("lon", "lat")), + ], +) +def test_fit(dim, mock_data_array): + """Tests the fit method of the SparsePCA class""" + + spca = SparsePCA() + spca.fit(mock_data_array, dim) + + # Assert the required attributes have been set + assert hasattr(spca, "preprocessor") + assert hasattr(spca, "data") + + +@pytest.mark.parametrize( + "dim", + [ + (("time",)), + (("lat", "lon")), + (("lon", "lat")), + ], +) +def test_explained_variance(dim, mock_data_array): + """Tests the explained_variance method of the SparsePCA class""" + spca = SparsePCA() + spca.fit(mock_data_array, dim) + + # Test explained_variance method + explained_variance = spca.explained_variance() + assert isinstance(explained_variance, xr.DataArray) + # Explained variance must be positive + assert (explained_variance > 0).all() + + +@pytest.mark.parametrize( + "dim", + [ + (("time",)), + (("lat", "lon")), + (("lon", "lat")), + ], +) +def test_explained_variance_ratio(dim, mock_data_array): + """Tests the explained_variance_ratio method of the SparsePCA class""" + spca = SparsePCA() + spca.fit(mock_data_array, dim) + + # Test explained_variance_ratio method + explained_variance_ratio = spca.explained_variance_ratio() + assert isinstance(explained_variance_ratio, xr.DataArray) + # Explained variance ratio must be positive + assert ( + explained_variance_ratio > 0 + ).all(), "The explained variance ratio must be positive" + # The sum of the explained variance ratio must be <= 1 + assert ( + explained_variance_ratio.sum() <= 1 + 1e-5 + ), "The sum of the explained variance ratio must be <= 1" + + +@pytest.mark.parametrize( + "dim", + [ + (("time",)), + (("lat", "lon")), + (("lon", "lat")), + ], +) +def test_isolated_nans(dim, mock_data_array_isolated_nans): + """Tests the components method of the SparsePCA class""" + spca = SparsePCA() + with pytest.raises(ValueError): + spca.fit(mock_data_array_isolated_nans, dim) + + +@pytest.mark.parametrize( + "dim", + [ + (("time",)), + (("lat", "lon")), + (("lon", "lat")), + ], +) 
+def test_components(dim, mock_data_array): + """Tests the components method of the SparsePCA class""" + spca = SparsePCA() + spca.fit(mock_data_array, dim) + + # Test components method + components = spca.components() + feature_dims = tuple(set(mock_data_array.dims) - set(dim)) + assert isinstance(components, xr.DataArray), "Components is not a DataArray" + assert set(components.dims) == set( + ("mode",) + feature_dims + ), "Components does not have the right feature dimensions" + + +@pytest.mark.parametrize( + "dim", + [ + (("time",)), + (("lat", "lon")), + (("lon", "lat")), + ], +) +def test_components_fulldim_nans(dim, mock_data_array_full_dimensional_nans): + """Tests the components method of the SparsePCA class""" + spca = SparsePCA() + spca.fit(mock_data_array_full_dimensional_nans, dim) + + # Test components method + components = spca.components() + feature_dims = tuple(set(mock_data_array_full_dimensional_nans.dims) - set(dim)) + assert isinstance(components, xr.DataArray), "Components is not a DataArray" + assert set(components.dims) == set( + ("mode",) + feature_dims + ), "Components does not have the right feature dimensions" + + +@pytest.mark.parametrize( + "dim", + [ + (("time",)), + (("lat", "lon")), + (("lon", "lat")), + ], +) +def test_components_boundary_nans(dim, mock_data_array_boundary_nans): + """Tests the components method of the SparsePCA class""" + spca = SparsePCA() + spca.fit(mock_data_array_boundary_nans, dim) + + # Test components method + components = spca.components() + feature_dims = tuple(set(mock_data_array_boundary_nans.dims) - set(dim)) + assert isinstance(components, xr.DataArray), "Components is not a DataArray" + assert set(components.dims) == set( + ("mode",) + feature_dims + ), "Components does not have the right feature dimensions" + + +@pytest.mark.parametrize( + "dim", + [ + (("time",)), + (("lat", "lon")), + (("lon", "lat")), + ], +) +def test_components_dataset(dim, mock_dataset): + """Tests the components method of the SparsePCA class""" + spca = SparsePCA() + spca.fit(mock_dataset, dim) + + # Test components method + components = spca.components() + feature_dims = tuple(set(mock_dataset.dims) - set(dim)) + assert isinstance(components, xr.Dataset), "Components is not a Dataset" + assert set(components.data_vars) == set( + mock_dataset.data_vars + ), "Components does not have the same data variables as the input Dataset" + assert set(components.dims) == set( + ("mode",) + feature_dims + ), "Components does not have the right feature dimensions" + + +@pytest.mark.parametrize( + "dim", + [ + (("time",)), + (("lat", "lon")), + (("lon", "lat")), + ], +) +def test_components_dataarray_list(dim, mock_data_array_list): + """Tests the components method of the SparsePCA class""" + spca = SparsePCA() + spca.fit(mock_data_array_list, dim) + + # Test components method + components = spca.components() + feature_dims = [tuple(set(data.dims) - set(dim)) for data in mock_data_array_list] + assert isinstance(components, list), "Components is not a list" + assert len(components) == len( + mock_data_array_list + ), "Components does not have the same length as the input list" + assert isinstance( + components[0], xr.DataArray + ), "Components is not a list of DataArrays" + for comp, feat_dims in zip(components, feature_dims): + assert set(comp.dims) == set( + ("mode",) + feat_dims + ), "Components does not have the right feature dimensions" + + +@pytest.mark.parametrize( + "dim", + [ + (("time",)), + (("lat", "lon")), + (("lon", "lat")), + ], +) +def 
test_scores(dim, mock_data_array): + """Tests the scores method of the SparsePCA class""" + spca = SparsePCA() + spca.fit(mock_data_array, dim) + + # Test scores method + scores = spca.scores() + assert isinstance(scores, xr.DataArray), "Scores is not a DataArray" + assert set(scores.dims) == set( + (dim + ("mode",)) + ), "Scores does not have the right dimensions" + + +@pytest.mark.parametrize( + "dim", + [ + (("time",)), + (("lat", "lon")), + (("lon", "lat")), + ], +) +def test_scores_fulldim_nans(dim, mock_data_array_full_dimensional_nans): + """Tests the scores method of the SparsePCA class""" + spca = SparsePCA() + spca.fit(mock_data_array_full_dimensional_nans, dim) + + # Test scores method + scores = spca.scores() + assert isinstance(scores, xr.DataArray), "Scores is not a DataArray" + assert set(scores.dims) == set( + (dim + ("mode",)) + ), "Scores does not have the right dimensions" + + +@pytest.mark.parametrize( + "dim", + [ + (("time",)), + (("lat", "lon")), + (("lon", "lat")), + ], +) +def test_scores_boundary_nans(dim, mock_data_array_boundary_nans): + """Tests the scores method of the SparsePCA class""" + spca = SparsePCA() + spca.fit(mock_data_array_boundary_nans, dim) + + # Test scores method + scores = spca.scores() + assert isinstance(scores, xr.DataArray), "Scores is not a DataArray" + assert set(scores.dims) == set( + (dim + ("mode",)) + ), "Scores does not have the right dimensions" + + +@pytest.mark.parametrize( + "dim", + [ + (("time",)), + (("lat", "lon")), + (("lon", "lat")), + ], +) +def test_scores_dataset(dim, mock_dataset): + """Tests the scores method of the SparsePCA class""" + spca = SparsePCA() + spca.fit(mock_dataset, dim) + + # Test scores method + scores = spca.scores() + assert isinstance(scores, xr.DataArray) + assert set(scores.dims) == set( + (dim + ("mode",)) + ), "Scores does not have the right dimensions" + + +@pytest.mark.parametrize( + "dim", + [ + (("time",)), + (("lat", "lon")), + (("lon", "lat")), + ], +) +def test_scores_dataarray_list(dim, mock_data_array_list): + """Tests the scores method of the SparsePCA class""" + spca = SparsePCA() + spca.fit(mock_data_array_list, dim) + + # Test scores method + scores = spca.scores() + assert isinstance(scores, xr.DataArray) + assert set(scores.dims) == set( + (dim + ("mode",)) + ), "Scores does not have the right dimensions" + + +def test_get_params(): + """Tests the get_params method of the SparsePCA class""" + spca = SparsePCA(n_modes=5, standardize=True, use_coslat=True, alpha=0.4) + + # Test get_params method + params = spca.get_params() + assert isinstance(params, dict) + assert params.get("n_modes") == 5 + assert params.get("standardize") is True + assert params.get("use_coslat") is True + assert params.get("alpha") == 0.4 + + +@pytest.mark.parametrize( + "dim", + [ + (("time",)), + (("lat", "lon")), + (("lon", "lat")), + ], +) +def test_transform(dim, mock_data_array): + """Test projecting new unseen data onto the components (SparsePCAs/eigenvectors)""" + data = mock_data_array.isel({dim[0]: slice(1, None)}) + new_data = mock_data_array.isel({dim[0]: slice(0, 1)}) + + # Create a xarray DataArray with random data + model = SparsePCA(n_modes=2, solver="full") + model.fit(data, dim) + scores = model.scores() + + # Project data onto the components + projections = model.transform(data) + + # Check that the projection has the right dimensions + assert projections.dims == scores.dims, "Projection has wrong dimensions" # type: ignore + + # Check that the projection has the right data type + assert 
isinstance(projections, xr.DataArray), "Projection is not a DataArray" + + # Check that the projection has the right name + assert projections.name == "scores", "Projection has wrong name: {}".format( + projections.name + ) + + # Check that the projection's data is the same as the scores + np.testing.assert_allclose( + scores.sel(mode=slice(1, 3)), projections.sel(mode=slice(1, 3)), rtol=1e-3 + ) + + # Project unseen data onto the components + new_projections = model.transform(new_data) + + # Check that the projection has the right dimensions + assert new_projections.dims == scores.dims, "Projection has wrong dimensions" # type: ignore + + # Check that the projection has the right data type + assert isinstance(new_projections, xr.DataArray), "Projection is not a DataArray" + + # Check that the projection has the right name + assert new_projections.name == "scores", "Projection has wrong name: {}".format( + new_projections.name + ) + + # Ensure that the new projections are not NaNs + assert np.all(new_projections.notnull().values), "New projections contain NaNs" + + +@pytest.mark.parametrize( + "dim", + [ + (("time",)), + (("lat", "lon")), + (("lon", "lat")), + ], +) +def test_transform_nan_feature(dim, mock_data_array): + """Test projecting new unseen data onto the components (SparsePCAs/eigenvectors)""" + data = mock_data_array.isel() + + # Create a xarray DataArray with random data + model = SparsePCA(n_modes=2, solver="full") + model.fit(data, dim) + + # Set a new feature to NaN and attempt to project data onto the components + feature_dims = list(set(data.dims) - set(dim)) + data_missing = data.copy() + data_missing.loc[{feature_dims[0]: data[feature_dims[0]][0].values}] = np.nan + + # with nan checking, transform should fail if any new features are NaN + with pytest.raises(ValueError): + model.transform(data_missing) + + # without nan checking, transform will succeed but be all nan + model = SparsePCA(n_modes=2, solver="full", check_nans=False) + model.fit(data, dim) + + data_transformed = model.transform(data_missing) + assert data_transformed.isnull().all() + + +@pytest.mark.parametrize( + "dim, normalized", + [ + (("time",), True), + (("lat", "lon"), True), + (("lon", "lat"), True), + (("time",), False), + (("lat", "lon"), False), + (("lon", "lat"), False), + ], +) +def test_inverse_transform(dim, mock_data_array, normalized): + """Test inverse_transform method in SparsePCA class.""" + + # instantiate the SparsePCA class with necessary parameters + spca = SparsePCA( + n_modes=20, + alpha=1e-5, + beta=1e-5, + standardize=True, + max_iter=2000, + tol=1e-9, + solver="full", + ) + + # fit the SparsePCA model + spca.fit(mock_data_array, dim=dim) + scores = spca.scores(normalized=normalized) + + # Test with single mode + scores_selection = scores.sel(mode=1) + X_rec_1 = spca.inverse_transform(scores_selection) + assert isinstance(X_rec_1, xr.DataArray) + + # Test with single mode as list + scores_selection = scores.sel(mode=[1]) + X_rec_1_list = spca.inverse_transform(scores_selection) + assert isinstance(X_rec_1_list, xr.DataArray) + + # Single mode and list should be equal + xr.testing.assert_allclose(X_rec_1, X_rec_1_list) + + # Test with all modes + X_rec = spca.inverse_transform(scores, normalized=normalized) + assert isinstance(X_rec, xr.DataArray) + + # Check that the reconstructed data has the same dimensions as the original data + assert set(X_rec.dims) == set(mock_data_array.dims) + + # Reconstructed data should be close to the original data + orig_dim_order = 
mock_data_array.dims + X_rec = X_rec.transpose(*orig_dim_order) + xr.testing.assert_allclose(mock_data_array, X_rec, rtol=1e-2, atol=1e-2) + + +@pytest.mark.parametrize( + "dim", + [ + (("time",)), + (("lat", "lon")), + (("lon", "lat")), + ], +) +@pytest.mark.parametrize("engine", ["netcdf4", "zarr"]) +def test_save_load(dim, mock_data_array, tmp_path, engine): + """Test save/load methods in SparsePCA class, ensuring that we can + roundtrip the model and get the same results when transforming + data.""" + original = SparsePCA() + original.fit(mock_data_array, dim) + + # Save the SparsePCA model + original.save(tmp_path / "spca", engine=engine) + + # Check that the SparsePCA model has been saved + assert (tmp_path / "spca").exists() + + # Recreate the model from saved file + loaded = SparsePCA.load(tmp_path / "spca", engine=engine) + + # Check that the params and DataContainer objects match + assert original.get_params() == loaded.get_params() + assert all([key in loaded.data for key in original.data]) + for key in original.data: + if original.data._allow_compute[key]: + assert loaded.data[key].equals(original.data[key]) + else: + # but ensure that input data is not saved by default + assert loaded.data[key].size <= 1 + assert loaded.data[key].attrs["placeholder"] is True + + # Test that the recreated model can be used to transform new data + assert np.allclose( + original.transform(mock_data_array), loaded.transform(mock_data_array) + ) + + # The loaded model should also be able to inverse_transform new data + assert np.allclose( + original.inverse_transform(original.scores()), + loaded.inverse_transform(loaded.scores()), + ) + + +@pytest.mark.parametrize( + "dim", + [ + (("time",)), + (("lat", "lon")), + (("lon", "lat")), + ], +) +def test_serialize_deserialize_dataarray(dim, mock_data_array): + """Test roundtrip serialization when the model is fit on a DataArray.""" + model = SparsePCA() + model.fit(mock_data_array, dim) + dt = model.serialize() + rebuilt_model = SparsePCA.deserialize(dt) + assert np.allclose( + model.transform(mock_data_array), rebuilt_model.transform(mock_data_array) + ) + + +@pytest.mark.parametrize( + "dim", + [ + (("time",)), + (("lat", "lon")), + (("lon", "lat")), + ], +) +def test_serialize_deserialize_dataset(dim, mock_dataset): + """Test roundtrip serialization when the model is fit on a Dataset.""" + model = SparsePCA() + model.fit(mock_dataset, dim) + dt = model.serialize() + rebuilt_model = SparsePCA.deserialize(dt) + assert np.allclose( + model.transform(mock_dataset), rebuilt_model.transform(mock_dataset) + ) + + +def test_complex_input_inverse_transform(mock_complex_data_array): + """Test that the SparsePCA model raises an error with complex input data.""" + + model = SparsePCA(n_modes=128) + with pytest.raises(TypeError): + model.fit(mock_complex_data_array, dim="time") diff --git a/xeofs/models/__init__.py b/xeofs/models/__init__.py index 49ee8ec..11d2d29 100644 --- a/xeofs/models/__init__.py +++ b/xeofs/models/__init__.py @@ -1,13 +1,13 @@ +from .cca import CCA +from .eeof import ExtendedEOF from .eof import EOF, ComplexEOF +from .eof_rotator import ComplexEOFRotator, EOFRotator +from .gwpca import GWPCA from .mca import MCA, ComplexMCA -from .eeof import ExtendedEOF +from .mca_rotator import ComplexMCARotator, MCARotator from .opa import OPA -from .gwpca import GWPCA from .rotator_factory import RotatorFactory -from .eof_rotator import EOFRotator, ComplexEOFRotator -from .mca_rotator import MCARotator, ComplexMCARotator -from .cca import CCA - +from 
.sparse_pca import SparsePCA __all__ = [ "EOF", @@ -23,4 +23,5 @@ "ComplexMCARotator", "CCA", "RotatorFactory", + "SparsePCA", ] diff --git a/xeofs/models/_np_classes/__init__.py b/xeofs/models/_np_classes/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/xeofs/models/_np_classes/_sparse_pca.py b/xeofs/models/_np_classes/_sparse_pca.py new file mode 100644 index 0000000..a837603 --- /dev/null +++ b/xeofs/models/_np_classes/_sparse_pca.py @@ -0,0 +1,697 @@ +""" +Sparse PCA via Variable Projection [1]_. + +This module provides an implementation of Sparse PCA via Variable Projection, +based heavily on the code from the Ristretto library (https://github.com/erichson/ristretto). + +We have made modifications to adapt the algorithm to be used with delayed Dask objects. + +This code is licensed under the MIT License. The original library is licensed under the GPL-3.0 License. + +References +---------- + +.. [1] Erichson, N. B. et al. Sparse Principal Component Analysis via Variable Projection. SIAM J. Appl. Math. 80, 977-1002 (2020). + +""" + +import dask.array.core +import dask.array.linalg +import dask.array.random +import numpy as np +import scipy +from sklearn.utils import check_random_state + +from ...utils.data_types import DaskArray + + +def random_gaussian_map(A, k, random_state): + """generate random gaussian map + + Parameters + ---------- + A : array_like, shape `(n, p)`. + Input array. + k : integer + Target rank. + random_state : RandomState instance + Random number generator. + + Returns + ------- + Omega : array_like, `(p, k)`. + Random gaussian matrix. + + """ + + # TODO: adapt for complex-valued data + + if isinstance(A, DaskArray): + Omega = dask.array.random.standard_normal( + size=(A.shape[1], k), chunks=(A.chunks[1], -1) + ) + return Omega.astype(A.dtype) + else: + return random_state.standard_normal(size=(A.shape[1], k)).astype(A.dtype) + + +def johnson_lindenstrauss(A, k, random_state=None): + """Johnson-Lindenstrauss Random Projection. + + Parameters + ---------- + A : array_like, shape `(n, p)`. + Input array. + k : integer + Target rank. + random_state : integer, RandomState instance or None + If integer, random_state is the seed used by the random number generator; + If RandomState instance, random_state is the random number generator; + If None, the random number generator is the RandomState instance used by np.random. + + Returns + ------- + array_like, `(n, k)`. + Projected matrix. + + """ + # TODO: adapt for complex-valued data + random_state = check_random_state(random_state) + + if A.ndim != 2: + raise ValueError("A must be a 2D array, not %dD" % A.ndim) + + # construct gaussian random matrix + Omega = random_gaussian_map(A, k, random_state) + + # project A onto Omega + return A.dot(Omega) + + +def orthonormalize(A, overwrite_a=True, check_finite=False): + """orthonormalize the columns of A via QR decomposition + + Parameters + ---------- + A : array_like, shape `(n, p)`. + Input array. + overwrite_a : bool, optional + Whether to overwrite data in A (may improve performance). + check_finite : bool, optional + Whether to check that the input matrix contains only finite numbers. + + Returns + ------- + Q : array_like, `(n, p)`. + Orthonormal basis matrix. 
+ + """ + # TODO: adapt for complex-valued data + if isinstance(A, DaskArray): + # NOTE: We expect A to be tall-and-skinny matrix that is chunked along rows + # This means our input data is assumed to be chunked along the feature dimensions + try: + Q, _ = dask.array.linalg.tsqr(A) + except ValueError: + raise ValueError( + "Data not chunked correctly. Ensure that the data is chunked along the feature dimensions." + ) + else: + # NOTE: for A(n, p) 'economic' returns Q(n, k), R(k, p) where k is min(n, p) + # TODO: when does overwrite_a even work? (fortran?) + QR = scipy.linalg.qr( + A, + overwrite_a=overwrite_a, + check_finite=check_finite, + mode="economic", + pivoting=False, + ) + Q = QR[0] + return Q + + +def perform_subspace_iterations(A, Q, n_iter=2): + """perform subspace iterations on Q + + Parameters + ---------- + A : array_like, shape `(n, p)`. + Input array. + Q : array_like, shape `(n, k)`. + Orthonormal basis matrix. + n_iter : integer, default: 2. + Number of subspace iterations. + + Returns + ------- + Q : array_like, `(n, k)`. + Orthonormal basis matrix. + + """ + # TODO: adapt for complex-valued data + # orthonormalize Y, overwriting + Q = orthonormalize(Q) + + # perform subspace iterations + for _ in range(n_iter): + Z = orthonormalize(A.T.dot(Q)) + Q = orthonormalize(A.dot(Z)) + + return Q + + +def conjugate_transpose(A): + """Performs conjugate transpose of A""" + if np.iscomplexobj(A): + return A.conj().T + return A.T + + +def _compute_rqb(A, rank, oversample=10, n_subspace=2, random_state=None): + """Randomized QB Decomposition. + + Randomized algorithm for computing the approximate low-rank QB + decomposition of a rectangular `(n, p)` matrix `A`, with target rank + `rank << min{n, p}`. The input matrix is factored as `A = Q * B`. + + The quality of the approximation can be controlled via the oversampling + parameter `oversample` and `n_subspace` which specifies the number of + subspace iterations. + + Parameters + ---------- + A : array_like, shape `(n, p)`. + Input array. + rank : integer + Target rank. Best if `rank << min{n, p}` + oversample : integer, optional (default: 10) + Controls the oversampling of column space. Increasing this parameter + may improve numerical accuracy. + n_subspace : integer, default: 2. + Parameter to control number of subspace iterations. Increasing this + parameter may improve numerical accuracy. Every additional subspace + iterations requires an additional full pass over the data matrix. + random_state : integer, RandomState instance or None + If integer, random_state is the seed used by the random number generator; + If RandomState instance, random_state is the random number generator; + If None, the random number generator is the RandomState instance used by np.random. + + Returns + ------- + Q: array_like, shape `(n, rank + oversample)`. + Orthonormal basis matrix. + B : array_like, shape `(rank + oversample, p)`. + Smaller matrix. + + """ + # TODO: adapt for complex-valued data + Q = johnson_lindenstrauss(A, rank + oversample, random_state=random_state) + + if n_subspace > 0: + Q = perform_subspace_iterations(A, Q, n_iter=n_subspace) + else: + Q = orthonormalize(Q) + + # Project the data matrix a into a lower dimensional subspace + B = conjugate_transpose(Q).dot(A) + + return Q, B + + +def compute_rqb(A, rank, oversample=20, n_subspace=2, n_blocks=1, random_state=None): + """Randomized QB Decomposition. 
+ + Randomized algorithm for computing the approximate low-rank QB + decomposition of a rectangular `(n, p)` matrix `A`, with target rank + `rank << min{n, p}`. The input matrix is factored as `A = Q * B`. + + The quality of the approximation can be controlled via the oversampling + parameter `oversample` and `n_subspace` which specifies the number of + subspace iterations. + + Parameters + ---------- + A : array_like, shape `(n, p)`. + Input array. + + rank : integer + Target rank. Best if `rank << min{n, p}` + + oversample : integer, optional (default: 10) + Controls the oversampling of column space. Increasing this parameter + may improve numerical accuracy. + + n_subspace : integer, default: 2. + Parameter to control number of subspace iterations. Increasing this + parameter may improve numerical accuracy. Every additional subspace + iterations requires an additional full pass over the data matrix. + + n_blocks : integer, default: 1. + If `n_blocks > 1` a column blocked QB decomposition procedure will be + performed. A larger number requires less fast memory, while it + leads to a higher computational time. + + random_state : integer, RandomState instance or None, optional (default `None`) + If integer, random_state is the seed used by the random number generator; + If RandomState instance, random_state is the random number generator; + If None, the random number generator is the RandomState instance used by np.random. + + Returns + ------- + Q: array_like, shape `(n, rank + oversample)`. + Orthonormal basis matrix. + + B : array_like, shape `(rank + oversample, p)`. + Smaller matrix. + + References + ---------- + N. Halko, P. Martinsson, and J. Tropp. + "Finding structure with randomness: probabilistic + algorithms for constructing approximate matrix + decompositions" (2009). + (available at `arXiv `_). + + S. Voronin and P.Martinsson. + "RSVDPACK: Subroutines for computing partial singular value + decompositions via randomized sampling on single core, multi core, + and GPU architectures" (2015). + (available at `arXiv `_). + + """ + # TODO: adapt for complex-valued data + if n_blocks > 1: + if isinstance(A, DaskArray): + raise ValueError( + f"For delayed-dask computation n_blocks must be 1, but found n_blocks={n_blocks}." + ) + n, _ = A.shape + + # index sets + row_sets = np.array_split(range(n), n_blocks) + + Q_block = [] + K = [] + + nblock = 1 + for rows in row_sets: + # converts A to array, raise ValueError if A has inf or nan + Qtemp, Ktemp = _compute_rqb( + A[rows, :], + rank=rank, + oversample=oversample, + n_subspace=n_subspace, + random_state=random_state, + ) + + Q_block.append(Qtemp) + K.append(Ktemp) + nblock += 1 + + Q_small, B = _compute_rqb( + np.concatenate(K, axis=0), + rank=rank, + oversample=oversample, + n_subspace=n_subspace, + random_state=random_state, + ) + + Q_small = np.vsplit(Q_small, n_blocks) + + Q = [Q_block[i].dot(Q_small[i]) for i in range(n_blocks)] + Q = np.concatenate(Q, axis=0) + + else: + Q, B = _compute_rqb( + A, + rank=rank, + oversample=oversample, + n_subspace=n_subspace, + random_state=random_state, + ) + + return Q, B + + +def soft_l0(arr, thresh): + """Soft threshold operator for l0 regularization. + + Parameters + ---------- + arr : array_like + Input array. + thresh : float + Threshold value. + + Returns + ------- + array_like + Thresholded array. + + """ + # TODO: adapt for complex-valued data + idx = arr**2 < 2 * thresh + arr[idx] = 0 + return arr + + +def soft_l1(arr, thresh): + """Soft threshold operator for l1 regularization. 
+ + Parameters + ---------- + arr : array_like + Input array. + thresh : float + Threshold value. + + Returns + ------- + array_like + Thresholded array. + + """ + # TODO: adapt for complex-valued data + return np.sign(arr) * np.maximum(np.abs(arr) - thresh, 0) + + +def compute_residual(X, B, A): + # TODO: adapt for complex-valued data + return X - X.dot(B).dot(A.T) + + +def compute_spca( + X, + n_components=None, + alpha=0.1, + beta=1e-5, + gamma=0.1, + robust=False, + regularizer="l1", + max_iter=1e3, + tol=1e-5, + compute=False, +): + r"""Sparse Principal Component Analysis (SPCA). + + Given a mean centered rectangular matrix `A` with shape `(m, n)`, SPCA + computes a set of sparse components that can optimally reconstruct the + input data. The amount of sparseness is controllable by the coefficient + of the L1 penalty, given by the parameter alpha. In addition, some ridge + shrinkage can be applied in order to improve conditioning. + + + Parameters + ---------- + X : array_like, shape `(n, p)`. + Input array. + + n_components : integer, `n_components << min{m,n}`. + Target rank, i.e., number of sparse components to be computed. + + alpha : float, (default ``alpha = 0.1``). + Sparsity controlling parameter. Higher values lead to sparser components. + + beta : float, (default ``beta = 1e-5``). + Amount of ridge shrinkage to apply in order to improve conditionin. + + regularizer : string {'l0', 'l1'}. + Type of sparsity-inducing regularizer. The l1 norm (also known as LASSO) + leads to softhreshold operator (default). The l0 norm is implemented + via a hardthreshold operator. + + robust : bool ``{'True', 'False'}``, optional (default ``False``). + Use a robust algorithm to compute the sparse PCA. + + max_iter : integer, (default ``max_iter = 500``). + Maximum number of iterations to perform before exiting. + + tol : float, (default ``tol = 1e-5``). + Stopping tolerance for reconstruction error. + + compute: bool + Whether to eagerly compute the rotation matrix. If ``True``, then + the rotation algorithm will check for convergence at each iteration + and stop once reached. If ``False``, then the rotation algorithm + can build the matrix lazily, but it will not check for convergence + and will run all of ``max_iter``. + + Returns + ------- + B: array_like, `(p, n_components)`. + Sparse components extracted from the data. + + A : array_like, `(p, n_components)`. + Orthogonal components extracted from the data. + + eigvals : array_like, `(n_components)`. + Eigenvalues correspnding to the extracted components. + + Notes + ----- + Variable Projection for PCA solves the following optimization problem: + minimize :math:`1/2 \| X - X B A^T \|^2 + \alpha \|B\|_1 + 1/2 \beta \|B\|^2` + """ + # TODO: adapt for complex-valued data + + if isinstance(X, DaskArray): + + def svd_algorithm(X): + return dask.array.linalg.svd(X) + else: + # TODO: can we pass the arguments without using functions? + # linalg.svd(X, full_matrices=False, overwrite_a=False) + # perhaps we can use Decomposer class to handle this + def svd_algorithm(X): + return scipy.linalg.svd(X, full_matrices=False, overwrite_a=False) + + if regularizer == "l1": + regularizer_func = soft_l1 + elif regularizer == "l0": + if robust: + raise NotImplementedError( + "l0 regularization is not supported for " "robust sparse pca" + ) + regularizer_func = soft_l0 + else: + raise ValueError( + 'regularizer must be one of ("l1", "l0"), not ' "%s." 
% regularizer + ) + + m, n = X.shape + if n_components is not None: + if n_components > n: + raise ValueError( + "n_components must be less than the number " "of columns of X (%d)" % n + ) + else: + n_components = n + + # Initialization of Variable Projection Solver + U, D, Vt = svd_algorithm(X) + Dmax = D[0] # l2 norm + + A = Vt[:n_components].T + B = Vt[:n_components].T + + if robust: + U = U[:, :n_components] + Vt = Vt[:n_components] + S = np.zeros_like(X) + else: + # compute outside the loop + VD = Vt.T * D + VD2 = Vt.T * D**2 + + # Set Tuning Parameters + alpha *= Dmax**2 + beta *= Dmax**2 + nu = 1.0 / (Dmax**2 + beta) + kappa = nu * alpha + + obj = [] # values of objective function + n_iter = 0 + + # Apply Variable Projection Solver + while max_iter > n_iter: + # Update A: + # X'XB = UDV' + # Compute X'XB via SVD of X + if robust: + XS = X - S + XB = X.dot(B) + Z = (XS).T.dot(XB) + else: + Z = VD2.dot(Vt.dot(B)) + + Utilde, Dtilde, Vttilde = svd_algorithm(Z) + A = Utilde.dot(Vttilde) + + # Proximal Gradient Descent to Update B + if robust: + R = XS - XB.dot(A.T) + G = X.T.dot(R.dot(A)) - beta * B + else: + G = VD2.dot(Vt.dot(A - B)) - beta * B + + B = regularizer_func(B + nu * G, kappa) + + if robust: + R = compute_residual(X, B, A) + S = soft_l1(R, gamma) + R -= S + else: + R = compute_residual(VD.T, B, A) + + objective = ( + 0.5 * np.sum(R**2) + alpha * np.sum(np.abs(B)) + 0.5 * beta * np.sum(B**2) + ) + if robust: + objective += gamma * np.sum(np.abs(S)) + + obj.append(objective) + + # Break if obj is not improving anymore + if compute and n_iter > 0 and abs(obj[-2] - obj[-1]) / obj[-1] < tol: + break + + # Next iter + n_iter += 1 + + eigen_values = Dtilde / (m - 1) + + return B, A, eigen_values + + +def compute_rspca( + X, + n_components, + alpha=0.1, + beta=0.1, + max_iter=1e3, + regularizer="l1", + tol=1e-5, + oversample=50, + n_subspace=2, + n_blocks=1, + robust=False, + random_state=None, + compute=False, +): + r"""Randomized Sparse Principal Component Analysis (rSPCA). + + Given a mean centered rectangular matrix `A` with shape `(n, p)`, SPCA + computes a set of sparse components that can optimally reconstruct the + input data. The amount of sparseness is controllable by the coefficient + of the L1 penalty, given by the parameter alpha. In addition, some ridge + shrinkage can be applied in order to improve conditioning. + + This algorithm uses randomized methods for linear algebra to accelerate + the computations. + + The quality of the approximation can be controlled via the oversampling + parameter `oversample` and `n_subspace` which specifies the number of + subspace iterations. + + + Parameters + ---------- + X : array_like, shape `(n, p)`. + Real nonnegative input matrix. + + n_components : integer, `n_components << min{m,n}`. + Target rank, i.e., number of sparse components to be computed. + + alpha : float, (default ``alpha = 0.1``). + Sparsity controlling parameter. Higher values lead to sparser components. + + beta : float, (default ``beta = 0.1``). + Amount of ridge shrinkage to apply in order to improve conditionin. + + regularizer : string {'l0', 'l1'}. + Type of sparsity-inducing regularizer. The l1 norm (also known as LASSO) + leads to softhreshold operator (default). The l0 norm is implemented + via a hardthreshold operator. + + max_iter : integer, (default ``max_iter = 500``). + Maximum number of iterations to perform before exiting. + + tol : float, (default ``tol = 1e-5``). + Stopping tolerance for reconstruction error. 
+
+
+def compute_rspca(
+    X,
+    n_components,
+    alpha=0.1,
+    beta=0.1,
+    max_iter=1e3,
+    regularizer="l1",
+    tol=1e-5,
+    oversample=50,
+    n_subspace=2,
+    n_blocks=1,
+    robust=False,
+    random_state=None,
+    compute=False,
+):
+    r"""Randomized Sparse Principal Component Analysis (rSPCA).
+
+    Given a mean-centered rectangular matrix `X` with shape `(n, p)`, rSPCA
+    computes a set of sparse components that can optimally reconstruct the
+    input data. The amount of sparsity is controlled by the coefficient
+    of the L1 penalty, given by the parameter `alpha`. In addition, some ridge
+    shrinkage can be applied in order to improve conditioning.
+
+    This algorithm uses randomized methods for linear algebra to accelerate
+    the computations.
+
+    The quality of the approximation can be controlled via the oversampling
+    parameter `oversample` and via `n_subspace`, which specifies the number
+    of subspace iterations.
+
+    Parameters
+    ----------
+    X : array_like, shape `(n, p)`.
+        Input array.
+
+    n_components : integer, `n_components << min{n,p}`.
+        Target rank, i.e., number of sparse components to be computed.
+
+    alpha : float, (default ``alpha = 0.1``).
+        Sparsity controlling parameter. Higher values lead to sparser components.
+
+    beta : float, (default ``beta = 0.1``).
+        Amount of ridge shrinkage to apply in order to improve conditioning.
+
+    regularizer : string {'l0', 'l1'}.
+        Type of sparsity-inducing regularizer. The l1 norm (also known as LASSO)
+        leads to a soft-threshold operator (default). The l0 norm is implemented
+        via a hard-threshold operator.
+
+    max_iter : integer, (default ``max_iter = 1e3``).
+        Maximum number of iterations to perform before exiting.
+
+    tol : float, (default ``tol = 1e-5``).
+        Stopping tolerance for reconstruction error.
+
+    oversample : integer, optional (default: 50).
+        Controls the oversampling of column space. Increasing this parameter
+        may improve numerical accuracy.
+
+    n_subspace : integer, default: 2.
+        Parameter to control the number of subspace iterations. Increasing this
+        parameter may improve numerical accuracy.
+
+    n_blocks : integer, default: 1.
+        Parameter to control how many blocks of columns the input matrix
+        should be split into. A larger number requires less fast memory, but
+        leads to a higher computational time.
+
+    robust : bool, optional (default ``False``).
+        Use a robust algorithm to compute the sparse PCA.
+
+    random_state : integer, RandomState instance or None, optional (default ``None``).
+        If integer, random_state is the seed used by the random number generator;
+        If RandomState instance, random_state is the random number generator;
+        If None, the random number generator is the RandomState instance used by np.random.
+
+    Returns
+    -------
+    B : array_like, `(p, n_components)`.
+        Sparse components extracted from the data.
+
+    A : array_like, `(p, n_components)`.
+        Orthogonal components extracted from the data.
+
+    eigvals : array_like, `(n_components)`.
+        Eigenvalues corresponding to the extracted components.
+
+    Notes
+    -----
+    Variable Projection for SPCA solves the following optimization problem:
+    minimize :math:`\frac{1}{2} \| X - X B A^T \|^2 + \alpha \|B\|_1 + \frac{1}{2} \beta \|B\|^2`
+    """
+    # TODO: adapt for complex-valued data
+
+    # Shape of data matrix
+    m = X.shape[0]
+
+    # Compute QB decomposition
+    _, Xcompressed = compute_rqb(
+        X,
+        rank=n_components,
+        oversample=oversample,
+        n_subspace=n_subspace,
+        n_blocks=n_blocks,
+        random_state=random_state,
+    )
+
+    # Compute Sparse PCA
+    B, A, eigen_values = compute_spca(
+        Xcompressed,
+        n_components=n_components,
+        alpha=alpha,
+        beta=beta,
+        regularizer=regularizer,
+        max_iter=max_iter,
+        tol=tol,
+        robust=robust,
+        compute=compute,
+    )
+    # rescale eigen values
+    eigen_values *= (n_components + oversample - 1) / (m - 1)
+
+    return B, A, eigen_values
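+
+
+# Editorial note: compute_rspca shares compute_spca's interface; the extra
+# randomized-QB parameters trade speed against accuracy, e.g. (illustrative):
+#
+#     B, A, eigvals = compute_rspca(
+#         X, n_components=5, alpha=1e-4, oversample=50, n_subspace=2,
+#         random_state=0,
+#     )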
diff --git a/xeofs/models/cca.py b/xeofs/models/cca.py
index d50ec43..658bfbb 100644
--- a/xeofs/models/cca.py
+++ b/xeofs/models/cca.py
@@ -19,12 +19,11 @@
 from sklearn.utils.validation import FLOAT_DTYPES
 from typing_extensions import Self
 
-from xeofs.models import EOF
-
 from .._version import __version__
 from ..preprocessing.preprocessor import Preprocessor
 from ..utils.data_types import DataArray, DataList, DataObject
 from ..utils.sanity_checks import assert_not_complex
+from .eof import EOF
 
 
 def _check_parameter_number(parameter_name: str, parameter, n_views: int):
diff --git a/xeofs/models/rotator_factory.py b/xeofs/models/rotator_factory.py
index 1b685d3..4abde46 100644
--- a/xeofs/models/rotator_factory.py
+++ b/xeofs/models/rotator_factory.py
@@ -1,7 +1,7 @@
 from .eof import EOF, ComplexEOF
+from .eof_rotator import ComplexEOFRotator, EOFRotator
 from .mca import MCA, ComplexMCA
-from .eof_rotator import EOFRotator, ComplexEOFRotator
-from .mca_rotator import MCARotator, ComplexMCARotator
+from .mca_rotator import ComplexMCARotator, MCARotator
 
 
 class RotatorFactory:
@@ -44,13 +44,13 @@ def create_rotator(
         """
         # We need to check the type of the model instead of isinstance because
         # of inheritance.
-        if type(model) == EOF:
+        if type(model) is EOF:
             return EOFRotator(**self.params)
-        elif type(model) == ComplexEOF:
+        elif type(model) is ComplexEOF:
             return ComplexEOFRotator(**self.params)
-        elif type(model) == MCA:
+        elif type(model) is MCA:
             return MCARotator(**self.params)
-        elif type(model) == ComplexMCA:
+        elif type(model) is ComplexMCA:
             return ComplexMCARotator(**self.params)
         else:
             err_msg = f"Invalid model type. Valid types are {self._valid_types}."
diff --git a/xeofs/models/sparse_pca.py b/xeofs/models/sparse_pca.py
new file mode 100644
index 0000000..27c9299
--- /dev/null
+++ b/xeofs/models/sparse_pca.py
@@ -0,0 +1,357 @@
+from typing import Dict, Optional
+
+import numpy as np
+import xarray as xr
+from typing_extensions import Self
+
+from ..utils.data_types import DataArray, DataObject
+from ..utils.sanity_checks import assert_not_complex
+from ..utils.xarray_utils import get_matrix_rank
+from ..utils.xarray_utils import total_variance as compute_total_variance
+from ._base_model import _BaseModel
+from ._np_classes._sparse_pca import compute_rspca, compute_spca
+
+
+class SparsePCA(_BaseModel):
+    """
+    Sparse PCA via Variable Projection.
+
+    Given a data matrix, Sparse PCA via Variable Projection [1]_ computes a set of sparse components that can optimally reconstruct the input data. The amount of sparsity is controlled by the L1 penalty coefficient, specified by the parameter `alpha`. Additionally, ridge shrinkage can be applied to improve conditioning.
+
+    This implementation uses randomized methods for linear algebra to accelerate computations. The quality of the approximation can be controlled via the oversampling parameter `oversample` and `n_subspace`, which specifies the number of subspace iterations.
+
+    Variable Projection for Sparse PCA solves the following optimization problem:
+
+    minimize :math:`\\frac{1}{2} \\| X - X B A^T \\|^2 + \\alpha \\|B\\|_1 + \\frac{1}{2} \\beta \\|B\\|^2`
+
+    Parameters
+    ----------
+    n_modes : int, default=2
+        Number of modes to calculate.
+    alpha : float, default=0.001
+        Sparsity controlling parameter. Higher values lead to sparser components.
+    beta : float, default=0.001
+        Amount of ridge shrinkage to apply in order to improve conditioning.
+    robust : bool, default=False
+        Use a robust algorithm to compute the sparse PCA.
+    regularizer : {'l0', 'l1'}, default='l1'
+        Type of sparsity-inducing regularizer. The L1 norm (also known as LASSO)
+        leads to a soft-threshold operator (default). The L0 norm is implemented
+        via a hard-threshold operator.
+    max_iter : int, default=500
+        Maximum number of iterations to perform before exiting.
+    tol : float, default=1e-6
+        Stopping tolerance for reconstruction error.
+    oversample : int, default=10
+        Controls the oversampling of column space. Increasing this parameter
+        may improve numerical accuracy.
+    n_subspace : int, default=1
+        Parameter to control the number of subspace iterations. Increasing this
+        parameter may improve numerical accuracy.
+    n_blocks : int, default=1
+        Parameter to control how many blocks of columns the input matrix
+        should be split into. A larger number requires less fast memory, but
+        increases computational time.
+    center : bool, default=True
+        Whether to center the input data.
+    standardize : bool, default=False
+        Whether to standardize the input data, i.e., each feature will have a variance of 1.
+    use_coslat : bool, default=False
+        Whether to use cosine of latitude for scaling.
+    sample_name : str, default="sample"
+        Name of the sample dimension.
+    feature_name : str, default="feature"
+        Name of the feature dimension.
+    check_nans : bool, default=True
+        Whether to check the data for NaNs before fitting.
+    compute : bool, default=True
+        Whether to compute elements of the model eagerly, or to defer computation.
+        If True, the following steps will be computed sequentially:
+        1) the preprocessor scaler,
+        2) optional NaN checks,
+        3) SVD decomposition,
+        4) scores and components.
+    verbose : bool, default=False
+        Whether to show a progress bar when computing the decomposition.
+    random_state : Optional[int], default=None
+        Seed for the random number generator.
+    solver : {"auto", "full", "randomized"}, default="auto"
+        Solver to use for the SVD computation.
+    solver_kwargs : dict, default={}
+        Additional keyword arguments to be passed to the SVD solver.
+
+    References
+    ----------
+    .. [1] Erichson, N. B. et al. Sparse Principal Component Analysis via Variable Projection. SIAM J. Appl. Math. 80, 977-1002 (2020).
+
+    Notes
+    -----
+    This implementation is adapted from the code provided by the authors of the paper [1]_, which is part of the Ristretto library (https://github.com/erichson/ristretto).
+
+    Examples
+    --------
+    >>> model = xe.models.SparsePCA(n_modes=2, alpha=1e-4)
+    >>> model.fit(data, "time")
+    >>> components = model.components()
+    """
+
+    def __init__(
+        self,
+        n_modes: int = 2,
+        alpha: float = 1e-3,
+        beta: float = 1e-3,
+        robust: bool = False,
+        regularizer: str = "l1",
+        max_iter: int = 500,
+        tol: float = 1e-6,
+        oversample: int = 10,
+        n_subspace: int = 1,
+        n_blocks: int = 1,
+        center: bool = True,
+        standardize: bool = False,
+        use_coslat: bool = False,
+        sample_name: str = "sample",
+        feature_name: str = "feature",
+        check_nans: bool = True,
+        compute: bool = True,
+        verbose: bool = False,
+        random_state: Optional[int] = None,
+        solver: str = "auto",
+        solver_kwargs: Dict = {},
+        **kwargs,
+    ):
+        super().__init__(
+            n_modes=n_modes,
+            center=center,
+            standardize=standardize,
+            use_coslat=use_coslat,
+            check_nans=check_nans,
+            sample_name=sample_name,
+            feature_name=feature_name,
+            compute=compute,
+            verbose=verbose,
+            random_state=random_state,
+            solver=solver,
+            solver_kwargs=solver_kwargs,
+            **kwargs,
+        )
+        self.attrs.update({"model": "Sparse PCA"})
+        self._params.update(
+            {
+                "alpha": alpha,
+                "beta": beta,
+                "robust": robust,
+                "regularizer": regularizer,
+                "max_iter": max_iter,
+                "tol": tol,
+                "oversample": oversample,
+                "n_subspace": n_subspace,
+                "n_blocks": n_blocks,
+            }
+        )
+
+    def _fit_algorithm(self, data: DataArray) -> Self:
+        sample_name = self.sample_name
+        feature_name = self.feature_name
+
+        # Check that the data is real.
+        # NOTE: Complex data is not supported yet. It is likely possible, but the
+        # current numpy implementation of sparse_pca would need to be adapted,
+        # mainly by changing matrix transposes to conjugate transposes.
+        # http://arxiv.org/abs/1804.00341
+        assert_not_complex(data)
+
+        # Compute the total variance
+        total_variance = compute_total_variance(data, dim=sample_name)
+
+        # Compute matrix rank
+        rank = get_matrix_rank(data)
+
+        # Decide whether to use the exact or the randomized algorithm
+        is_small_data = max(data.shape) < 500
+        solver = self._params["solver"]
+
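+        # Editorial note: for solver="auto", the exact algorithm is only worth
+        # using when the data is small and nearly the full rank is requested;
+        # otherwise the randomized variant is typically faster at comparable
+        # accuracy.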
+        match solver:
+            case "auto":
+                use_exact = is_small_data and self.n_modes > int(0.8 * rank)
+            case "full":
+                use_exact = True
+            case "randomized":
+                use_exact = False
+            case _:
+                raise ValueError(
+                    f"Unrecognized solver '{solver}'. "
+                    "Valid options are 'auto', 'full', and 'randomized'."
+                )
+
+        decomposing_kwargs = dict(
+            n_components=self.n_modes,
+            alpha=self._params["alpha"],
+            beta=self._params["beta"],
+            robust=self._params["robust"],
+            regularizer=self._params["regularizer"],
+            max_iter=self._params["max_iter"],
+            tol=self._params["tol"],
+            compute=self._params["compute"],
+        )
+        if use_exact:
+            decomposing_algorithm = compute_spca
+        else:
+            decomposing_algorithm = compute_rspca
+            decomposing_kwargs.update(
+                dict(
+                    oversample=self._params["oversample"],
+                    n_subspace=self._params["n_subspace"],
+                    n_blocks=self._params["n_blocks"],
+                    random_state=self._params["random_state"],
+                )
+            )
+
+        # Fit the data. We obtain the following outputs, using the notation
+        # of Erichson et al. (2020):
+        #   components        : sparse weight matrix B
+        #   components_normal : orthonormal matrix A
+        #   exp_var           : eigenvalues
+        components, components_normal, exp_var = xr.apply_ufunc(
+            decomposing_algorithm,
+            data,
+            input_core_dims=[[sample_name, feature_name]],
+            output_core_dims=[[feature_name, "mode"], [feature_name, "mode"], ["mode"]],
+            dask="allowed",
+            kwargs=decomposing_kwargs,
+        )
+
+        # Add coordinates to the results
+        exp_var.name = "explained_variance"
+        exp_var = exp_var.assign_coords({"mode": np.arange(1, self.n_modes + 1)})
+
+        components.name = "sparse_weight_vectors"
+        components = components.assign_coords(
+            {
+                feature_name: data.coords[feature_name],
+                "mode": np.arange(1, self.n_modes + 1),
+            },
+        )
+
+        components_normal.name = "orthonormal_weight_vectors"
+        components_normal = components_normal.assign_coords(
+            {
+                feature_name: data.coords[feature_name],
+                "mode": np.arange(1, self.n_modes + 1),
+            },
+        )
+
+        # Transform the data
+        scores = xr.dot(data, components, dims=feature_name)
+        scores.name = "scores"
+
+        norms = xr.apply_ufunc(
+            np.linalg.norm,
+            scores,
+            input_core_dims=[[sample_name]],
+            output_core_dims=[[]],
+            exclude_dims={sample_name},
+            kwargs={"axis": -1},
+            dask="allowed",
+            vectorize=False,
+        )
+        norms.name = "component_norms"
+
+        # Store the results
+        self.data.add(data, "input_data", allow_compute=False)
+        self.data.add(components, "components")
+        self.data.add(components_normal, "components_normal")
+        self.data.add(scores, "scores")
+        self.data.add(norms, "norms")
+        self.data.add(exp_var, "explained_variance")
+        self.data.add(total_variance, "total_variance")
+
+        self.data.set_attrs(self.attrs)
+        return self
+
+    def _transform_algorithm(self, data: DataObject) -> DataArray:
+        feature_name = self.preprocessor.feature_name
+
+        components = self.data["components"]
+
+        # Project the data
+        projections = xr.dot(data, components, dims=feature_name)
+        projections.name = "scores"
+
+        return projections
+
+    def _inverse_transform_algorithm(self, scores: DataArray) -> DataArray:
+        # Reconstruct the data
+        comps = self.data["components_normal"].sel(mode=scores.mode)
+
+        reconstructed_data = xr.dot(comps.conj(), scores, dims="mode")
+        reconstructed_data.name = "reconstructed_data"
+
+        return reconstructed_data
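+
+    # Editorial note: inverse_transform reconstructs with the orthonormal
+    # weights A ("components_normal"), matching X_hat = X B A^T in Erichson
+    # et al. (2020), whereas transform projects onto the sparse weights B.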
+
+    def components(self) -> DataObject:
+        """Return the sparse components.
+
+        The components are given by the sparse weight matrix B in the optimization problem.
+
+        Returns
+        -------
+        components : DataArray | Dataset | List[DataArray]
+            Components of the fitted model.
+
+        """
+        return super().components()
+
+    def scores(self, normalized: bool = True) -> DataArray:
+        """Return the component scores.
+
+        The component scores :math:`U` are defined as the projection of the fitted
+        data :math:`X` onto the sparse weight components :math:`B`.
+
+        .. math::
+            U = X B
+
+        Parameters
+        ----------
+        normalized : bool, default=True
+            Whether to normalize the scores by the L2 norm.
+
+        Returns
+        -------
+        scores : DataArray | Dataset | List[DataArray]
+            Scores of the fitted model.
+
+        """
+        return super().scores(normalized=normalized)
+
+    def explained_variance(self) -> DataArray:
+        """Return the explained variance.
+
+        Returns
+        -------
+        explained_variance : DataArray
+            Explained variance.
+        """
+        return self.data["explained_variance"]
+
+    def explained_variance_ratio(self) -> DataArray:
+        """Return the explained variance ratio.
+
+        The explained variance ratio :math:`\\gamma_i` is the variance explained
+        by each mode normalized by the total variance. It is defined as
+
+        .. math::
+            \\gamma_i = \\frac{\\lambda_i}{\\sum_{j=1}^M \\lambda_j}
+
+        where :math:`\\lambda_i` is the explained variance of the :math:`i`-th mode and :math:`M` is the total number of modes.
+
+        Returns
+        -------
+        explained_variance_ratio : DataArray
+            Explained variance ratio.
+        """
+        exp_var_ratio = self.data["explained_variance"] / self.data["total_variance"]
+        exp_var_ratio.attrs.update(self.data["explained_variance"].attrs)
+        exp_var_ratio.name = "explained_variance_ratio"
+        return exp_var_ratio
diff --git a/xeofs/utils/xarray_utils.py b/xeofs/utils/xarray_utils.py
index 8b169e0..fcbb2e1 100644
--- a/xeofs/utils/xarray_utils.py
+++ b/xeofs/utils/xarray_utils.py
@@ -1,20 +1,20 @@
-from typing import Sequence, Hashable, Tuple, TypeVar, List, Any
+from typing import Any, Hashable, List, Sequence, Tuple, TypeVar
 
 import numpy as np
 import xarray as xr
 
-from .sanity_checks import convert_to_dim_type
+from .constants import VALID_LATITUDE_NAMES
 from .data_types import (
-    Dims,
-    DimsList,
     DaskArray,
     Data,
-    DataVar,
     DataArray,
-    DataSet,
     DataList,
+    DataSet,
+    DataVar,
+    Dims,
+    DimsList,
 )
-from .constants import VALID_LATITUDE_NAMES
+from .sanity_checks import convert_to_dim_type
 
 T = TypeVar("T")
 
@@ -133,7 +133,6 @@ def compute_sqrt_cos_lat_weights(data: DataVar, feature_dims: Dims) -> DataVar:
             for var, da in data.data_vars.items()
         }
     )
-
     else:
         raise TypeError(
             "Invalid input type: {:}. Expected one of the following: DataArray".format(
@@ -330,3 +329,25 @@ def argsort_dask(data: Data, dim: str) -> Data:
         output_core_dims=[[dim]],
         dask="parallelized",
    )
+
+
+def get_matrix_rank(data: DataArray) -> int:
+    """Compute the rank of a 2D matrix.
+
+    We use the imprecise but practical assumption for real-world data
+    that the rank of the matrix is the minimum of the number of rows and columns.
+
+    Parameters
+    ----------
+    data : DataArray
+        Input data.
+
+    Returns
+    -------
+    rank : int
+        Rank of the input matrix.
+
+    """
+    if len(data.dims) != 2:
+        raise ValueError("Input data must be a 2D matrix.")
+    return min(data.shape)
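
A minimal end-to-end sketch of the `SparsePCA` API added in this patch (assuming the usual `import xeofs as xe` convention and reusing the ERSSTv5 tutorial data from the docs example; parameter values are illustrative, not recommendations):

```python
import xarray as xr
import xeofs as xe

# Monthly sea surface temperatures, as in the docs example
sst = xr.tutorial.open_dataset("ersstv5")["sst"]

# Fit sparse PCA along the time dimension; alpha controls sparsity
model = xe.models.SparsePCA(n_modes=4, alpha=1e-5)
model.fit(sst, dim="time")

components = model.components()             # sparse weight vectors B
scores = model.scores()                     # projections of the data onto B
exp_var = model.explained_variance_ratio()  # lambda_i / total variance
```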