Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Change default for window size in EquivalentSourcesGB #487

Merged
merged 15 commits into from
May 31, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
52 changes: 42 additions & 10 deletions harmonica/_equivalent_sources/gradient_boosted.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@
"""
Gradient-boosted equivalent sources in Cartesian coordinates
"""
import warnings

import numpy as np
import verde.base as vdb
from sklearn import utils
Expand Down Expand Up @@ -60,12 +62,13 @@ class EquivalentSourcesGB(EquivalentSources):
If None, no block-averaging is applied.
This parameter is ignored if *points* are specified.
Default to None.
window_size : float
window_size : float or "default"
Size of overlapping windows used during the gradient-boosting
algorithm. Smaller windows reduce the memory requirements of the source
coefficients fitting process. Very small windows may impact on the
accuracy of the interpolations.
Defaults to 5000.
Defaults to estimating a window size such that approximately 5000 data
points are in each window.
parallel : bool
If True any predictions and Jacobian building is carried out in
parallel through Numba's ``jit.prange``, reducing the computation time.
Expand All @@ -84,6 +87,11 @@ class EquivalentSourcesGB(EquivalentSources):
The boundaries (``[W, E, S, N]``) of the data used to fit the
interpolator. Used as the default region for the
:meth:`~harmonica.EquivalentSources.grid` method.
window_size_ : float or None
Size of the overlapping windows used in gradient-boosting equivalent
point sources. It will be set to None if ``window_size = "default"``
and less than 5000 data points were used to fit the sources; a single
window will be used in such case.

References
----------
Expand All @@ -99,11 +107,16 @@ def __init__(
points=None,
depth=500,
block_size=None,
window_size=5e3,
window_size="default",
parallel=True,
random_state=None,
dtype="float64",
):
if isinstance(window_size, str) and window_size != "default":
raise ValueError(
f"Found invalid 'window_size' value equal to '{window_size}'."
"It should be 'default' or a numeric value."
)
super().__init__(
damping=damping,
points=points,
Expand Down Expand Up @@ -274,7 +287,7 @@ def _create_windows(self, coordinates, shuffle=True):
coordinates : tuple
Arrays with the coordinates of each data point. Should be in the
following order: (``easting``, ``northing``, ``upward``).
shuffle_windows : bool
shuffle : bool
Enable or disable the random shuffling of windows order. It's is
highly recommended to enable shuffling for better fitting results.
This argument is mainly included for testing purposes. Default to
Expand All @@ -289,21 +302,40 @@ def _create_windows(self, coordinates, shuffle=True):
the same as the one in ``data_windows_nonempty``.
data_windows_nonempty : list
List containing arrays with the indices of the data points that
under each window. The order of the windows is randomly shuffled if
``shuffle_windows`` is True, although the order of the windows is
the same as the one in ``source_windows_nonempty``.
fall under each window. The order of the windows is randomly
shuffled if ``shuffle_windows`` is True, although the order of the
windows is the same as the one in ``source_windows_nonempty``.
"""
# Compute window spacing based on overlapping
window_spacing = self.window_size * (1 - self.overlapping)

# Get the region that contains every data point and every source
region = _get_region_data_sources(coordinates, self.points_)
# Calculate the window size such that there are approximately 5000 data
# points in each window. Otherwise use the given window size.
if self.window_size == "default":
area = (region[1] - region[0]) * (region[3] - region[2])
ndata = coordinates[0].size
if ndata <= 5e3:
warnings.warn(
f"Found {ndata} number of coordinates (<= 5e3). Only one window will be used."
)
source_windows_nonempty = [np.arange(self.points_[0].size)]
data_windows_nonempty = [np.arange(ndata)]
self.window_size_ = None
return source_windows_nonempty, data_windows_nonempty
points_per_m2 = ndata / area
window_area = 5e3 / points_per_m2
self.window_size_ = np.sqrt(window_area)
else:
self.window_size_ = self.window_size
# Compute window spacing based on overlapping
window_spacing = self.window_size_ * (1 - self.overlapping)
# The windows for sources and data points are the same, but the
# verde.rolling_window function creates indices for the given
# coordinates. That's why we need to create two set of window indices:
# one for the sources and one for the data points.
# We pass the same region, size and spacing to be sure that both set of
# windows are the same.
kwargs = dict(region=region, size=self.window_size, spacing=window_spacing)
kwargs = dict(region=region, size=self.window_size_, spacing=window_spacing)
_, source_windows = rolling_window(self.points_, **kwargs)
_, data_windows = rolling_window(coordinates, **kwargs)
# Ravel the indices
Expand Down
37 changes: 37 additions & 0 deletions harmonica/tests/test_gradient_boosted_eqs.py
Original file line number Diff line number Diff line change
Expand Up @@ -373,3 +373,40 @@ def test_error_ignored_args(coordinates_small, data_small, region):
msg = "The 'bla' arguments are being ignored."
with pytest.warns(FutureWarning, match=msg):
eqs.grid(coordinates=grid_coords, bla="bla")


def test_window_size_less_than_5000():
region = (0, 10e3, -5e3, 5e3)
grid_coords = vd.grid_coordinates(region=region, shape=(64, 64), extra_coords=0)
grid_coords = [c.ravel() for c in grid_coords]
eqs = EquivalentSourcesGB()
eqs.points_ = eqs._build_points(
grid_coords
) # need to build sources first before creating windows.
with pytest.warns(UserWarning, match=f"Found {64**2} number of coordinates"):
source_windows, data_windows = eqs._create_windows(grid_coords)
assert eqs.window_size_ is None
assert len(source_windows) == 1
assert len(data_windows) == 1
# Check if all sources and data points are inside the window
for coord in eqs.points_:
npt.assert_allclose(coord, coord[source_windows[0]])
for coord in grid_coords:
npt.assert_allclose(coord, coord[data_windows[0]])


def test_window_size():
region = (0, 10e3, -5e3, 5e3)
grid_coords = vd.grid_coordinates(region=region, shape=(100, 100), extra_coords=0)
eqs = EquivalentSourcesGB()
eqs.points_ = eqs._build_points(
grid_coords
) # need to build sources first before creating windows.
eqs._create_windows(grid_coords)
expected_window_size = np.sqrt(5e3 / (100**2 / 10e3**2))
npt.assert_allclose(eqs.window_size_, expected_window_size)


def test_invalid_window_size():
with pytest.raises(ValueError, match="Found invalid 'window_size' value equal to"):
EquivalentSourcesGB(window_size="Chuckie took my soul!")