Replace rmm::mr::device_memory_resource* with rmm::device_async_resource_ref (#1373)

Closes #1371

Replaces all occurrences of `rmm::mr::device_memory_resource*` in parameters with `rmm::device_async_resource_ref`. Also updates guidance in DEVELOPER_GUIDE.md.

Authors:
  - Mark Harris (https://github.com/harrism)

Approvers:
  - Paul Taylor (https://github.com/trxcllnt)
  - Michael Wang (https://github.com/isVoid)

URL: #1373
harrism authored Apr 18, 2024
1 parent c5203dc commit 76762fc
Showing 58 changed files with 281 additions and 193 deletions.
54 changes: 41 additions & 13 deletions cpp/doxygen/developer_guide/DEVELOPER_GUIDE.md
Original file line number Diff line number Diff line change
@@ -84,7 +84,7 @@ Examples:

```c++
template <typename IteratorType>
void algorithm_function(int x, rmm::cuda_stream_view s, rmm::device_memory_resource* mr)
void algorithm_function(int x, rmm::cuda_stream_view s, rmm::device_async_resource_ref mr)
{
...
}
@@ -233,9 +233,10 @@ std::unique_ptr<cudf::table> points_in_spatial_window(
cudf::column_view const& y);
```
## RMM Memory Resources (`rmm::device_memory_resource`)
## Memory Resources (`rmm::device_memory_resource`)
libcuspatial allocates all device memory via RMM memory resources (MR). See the
libcuspatial allocates all device memory via RMM memory resources (MR) or CUDA MRs. Either type
can be passed to libcuspatial functions via `rmm::device_async_resource_ref` parameters. See the
[RMM documentation](https:/rapidsai/rmm/blob/main/README.md) for details.
### Current Device Memory Resource
@@ -245,6 +246,27 @@ RMM provides a "default" memory resource for each device that can be accessed an
respectively. All memory resource parameters should be defaulted to use the return value of
`rmm::mr::get_current_device_resource()`.
### Resource Refs
Memory resources are passed via resource ref parameters. A resource ref is a memory resource wrapper
that enables consumers to specify properties of resources that they expect. These are defined
in the `cuda::mr` namespace of libcu++, but RMM provides some convenience wrappers in
`rmm/resource_ref.hpp`:
- `rmm::device_resource_ref` accepts a memory resource that provides synchronous allocation
of device-accessible memory.
- `rmm::device_async_resource_ref` accepts a memory resource that provides stream-ordered allocation
of device-accessible memory.
- `rmm::host_resource_ref` accepts a memory resource that provides synchronous allocation of host-
accessible memory.
- `rmm::host_async_resource_ref` accepts a memory resource that provides stream-ordered allocation
of host-accessible memory.
- `rmm::host_device_resource_ref` accepts a memory resource that provides synchronous allocation of
host- and device-accessible memory.
- `rmm::host_device_async_resource_ref` accepts a memory resource that provides stream-ordered
allocation of host- and device-accessible memory.
See the libcu++ [docs on `resource_ref`](https://nvidia.github.io/cccl/libcudacxx/extended_api/memory_resource/resource_ref.html) for more information.
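The resource-ref idea can be illustrated with a simplified, standard-C++ sketch. These are hypothetical stand-in types (`toy_resource_ref`, `counting_resource`), not the real libcu++/RMM implementations, which additionally encode properties such as device accessibility and stream-ordered allocation at the type level:

```cpp
#include <cassert>
#include <cstddef>

// Simplified sketch of a resource ref: a lightweight, non-owning handle that
// type-erases any resource providing allocate/deallocate, so a function need
// not be templated on (or own) the concrete memory resource type.
class toy_resource_ref {
 public:
  template <typename Resource>
  toy_resource_ref(Resource& r)
    : res_{&r},
      alloc_{[](void* res, std::size_t n) {
        return static_cast<Resource*>(res)->allocate(n);
      }},
      dealloc_{[](void* res, void* p, std::size_t n) {
        static_cast<Resource*>(res)->deallocate(p, n);
      }}
  {
  }

  void* allocate(std::size_t n) { return alloc_(res_, n); }
  void deallocate(void* p, std::size_t n) { dealloc_(res_, p, n); }

 private:
  void* res_;
  void* (*alloc_)(void*, std::size_t);
  void (*dealloc_)(void*, void*, std::size_t);
};

// A toy resource that counts allocations, standing in for a device MR.
struct counting_resource {
  std::size_t count{0};
  void* allocate(std::size_t n) { ++count; return ::operator new(n); }
  void deallocate(void* p, std::size_t) { ::operator delete(p); }
};

// An API accepting any resource by ref, analogous to libcuspatial functions
// taking rmm::device_async_resource_ref.
inline void* algorithm_function(std::size_t n, toy_resource_ref mr)
{
  return mr.allocate(n);
}
```

Any concrete resource converts implicitly to the ref at the call site, so a caller can pass a pooled, binning, or default resource without the API's signature changing.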
# libcuspatial API and Implementation
This section provides specifics about the structure and implementation of cuSpatial API functions.
@@ -439,8 +461,8 @@ There are a few key points to notice.
cuSpatial APIs will not need to use this returned iterator.
9. All APIs that run CUDA device code (including Thrust algorithms) or allocate memory take a CUDA
stream on which to execute the device code and allocate memory.
10. Any API that allocate and return device data (not shown here) should also take an
`rmm::device_memory_resource` to use for output memory allocation.
10. Any API that allocates and returns device data (not shown here) should also take an
`rmm::device_async_resource_ref` to use for output memory allocation.
### (Multiple) Return Values
@@ -542,22 +564,28 @@ control how device memory is allocated.
### Output Memory
Any libcuspatial API that allocates memory that is *returned* to a user must accept a pointer to a
`device_memory_resource` as the last parameter. Inside the API, this memory resource must be used
to allocate any memory for returned objects. It should therefore be passed into functions whose
outputs will be returned. Example:
Any libcuspatial API that allocates memory that is *returned* to a user must accept a
`rmm::device_async_resource_ref` as the last parameter. Inside the API, this memory resource must
be used to allocate any memory for returned objects. It should therefore be passed into functions
whose outputs will be returned. Example:
```c++
// Returned `column` contains newly allocated memory,
// therefore the API must accept a memory resource
std::unique_ptr<column> returns_output_memory(
..., rmm::device_memory_resource * mr = rmm::mr::get_current_device_resource());
..., rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
// This API does not allocate any new *output* memory, therefore
// a memory resource is unnecessary
void does_not_allocate_output_memory(...);
```
This rule automatically applies to all detail APIs that allocate memory. Any detail API may be
called by any public API, and therefore could be allocating memory that is returned to the user.
To support such use cases, all detail APIs that allocate memory should accept an `mr`
parameter. Callers are responsible for passing through either a provided `mr` or
`rmm::mr::get_current_device_resource()`, as needed.
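This forwarding pattern can be sketched with hypothetical host-side stand-ins (not the real RMM/cuSpatial types): the public API defaults `mr`, and passes it through to the detail API that performs the output allocation.

```cpp
#include <cstddef>
#include <memory>

// Toy stand-in for a device memory resource interface.
struct memory_resource {
  virtual void* allocate(std::size_t n)            = 0;
  virtual void deallocate(void* p, std::size_t n)  = 0;
  virtual ~memory_resource()                       = default;
};

struct new_delete_resource : memory_resource {
  void* allocate(std::size_t n) override { return ::operator new(n); }
  void deallocate(void* p, std::size_t) override { ::operator delete(p); }
};

// Stand-in for rmm::mr::get_current_device_resource().
inline memory_resource* get_current_device_resource()
{
  static new_delete_resource r;
  return &r;
}

// Stand-in for a returned, owning device allocation.
struct buffer {
  memory_resource* mr;
  void* data;
  std::size_t size;
  buffer(std::size_t n, memory_resource* m) : mr{m}, data{m->allocate(n)}, size{n} {}
  ~buffer() { mr->deallocate(data, size); }
};

namespace detail {
// Detail API: allocates output memory, so it accepts `mr` explicitly.
inline std::unique_ptr<buffer> make_output(std::size_t n, memory_resource* mr)
{
  return std::make_unique<buffer>(n, mr);
}
}  // namespace detail

// Public API: defaults `mr` and forwards it to the detail API.
inline std::unique_ptr<buffer> make_output(
  std::size_t n, memory_resource* mr = get_current_device_resource())
{
  return detail::make_output(n, mr);
}
```

The key property is that whichever resource the user supplies (or the default, if none) is the one that performs every allocation reachable from the public API's return value.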
### Temporary Memory
Not all memory allocated within a libcuspatial API is returned to the caller. Often algorithms must
@@ -566,7 +594,7 @@ obtained from `rmm::mr::get_current_device_resource()` for temporary memory allo
```c++
rmm::device_buffer some_function(
..., rmm::mr::device_memory_resource mr * = rmm::mr::get_current_device_resource()) {
  ..., rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()) {
rmm::device_buffer returned_buffer(..., mr); // Returned buffer uses the passed in MR
...
rmm::device_buffer temporary_buffer(...); // Temporary buffer uses default MR
@@ -578,12 +606,12 @@ rmm::device_buffer some_function(
### Memory Management
libcuspatial code eschews raw pointers and direct memory allocation. Use RMM classes built to
use [`device_memory_resource`](https:/rapidsai/rmm/#device_memory_resource) for device
use [memory resources](https:/rapidsai/rmm/#device_memory_resource) for device
memory allocation with automated lifetime management.
#### rmm::device_buffer
Allocates a specified number of bytes of untyped, uninitialized device memory using a
`device_memory_resource`. If no resource is explicitly provided, uses
memory resource. If no `rmm::device_async_resource_ref` is explicitly provided, uses
`rmm::mr::get_current_device_resource()`.
`rmm::device_buffer` is movable and copyable on a stream. A copy performs a deep copy of the
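The ownership semantics described above (allocation through an explicit resource, deep copy on copy, ownership transfer on move) can be sketched with standard-C++ stand-ins. This `toy_device_buffer` is hypothetical, uses host memory, and omits the CUDA stream the real `rmm::device_buffer` carries:

```cpp
#include <cassert>
#include <cstddef>
#include <cstring>
#include <utility>

// Toy resource standing in for a device MR.
struct toy_resource {
  void* allocate(std::size_t n) { return ::operator new(n); }
  void deallocate(void* p, std::size_t) { ::operator delete(p); }
};

// Stand-in for rmm::mr::get_current_device_resource().
inline toy_resource* get_default_resource()
{
  static toy_resource r;
  return &r;
}

class toy_device_buffer {
 public:
  explicit toy_device_buffer(std::size_t size, toy_resource* mr = get_default_resource())
    : mr_{mr}, size_{size}, data_{mr->allocate(size)}
  {
  }

  // Copying performs a deep copy of the underlying bytes.
  toy_device_buffer(toy_device_buffer const& other)
    : mr_{other.mr_}, size_{other.size_}, data_{mr_->allocate(size_)}
  {
    std::memcpy(data_, other.data_, size_);
  }

  // Moving transfers ownership without copying.
  toy_device_buffer(toy_device_buffer&& other) noexcept
    : mr_{other.mr_}, size_{other.size_}, data_{other.data_}
  {
    other.data_ = nullptr;
    other.size_ = 0;
  }

  toy_device_buffer& operator=(toy_device_buffer const&) = delete;
  toy_device_buffer& operator=(toy_device_buffer&&)      = delete;

  ~toy_device_buffer()
  {
    if (data_ != nullptr) { mr_->deallocate(data_, size_); }
  }

  void* data() { return data_; }
  std::size_t size() const { return size_; }

 private:
  toy_resource* mr_;
  std::size_t size_;
  void* data_;
};
```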
9 changes: 5 additions & 4 deletions cpp/include/cuspatial/bounding_boxes.hpp
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2020-2022, NVIDIA CORPORATION.
* Copyright (c) 2020-2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand All @@ -19,6 +19,7 @@
#include <cudf/types.hpp>

#include <rmm/mr/device/per_device_resource.hpp>
#include <rmm/resource_ref.hpp>

#include <memory>

@@ -53,7 +54,7 @@ std::unique_ptr<cudf::table> linestring_bounding_boxes(
cudf::column_view const& x,
cudf::column_view const& y,
double expansion_radius,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());

/**
* @brief Compute minimum bounding box for each polygon in a list.
@@ -80,8 +81,8 @@ std::unique_ptr<cudf::table> polygon_bounding_boxes(
cudf::column_view const& ring_offsets,
cudf::column_view const& x,
cudf::column_view const& y,
double expansion_radius = 0.0,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
double expansion_radius = 0.0,
rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());

/**
* @} // end of doxygen group
5 changes: 3 additions & 2 deletions cpp/include/cuspatial/detail/index/construction/phase_1.cuh
@@ -24,6 +24,7 @@
#include <rmm/cuda_stream_view.hpp>
#include <rmm/device_uvector.hpp>
#include <rmm/exec_policy.hpp>
#include <rmm/resource_ref.hpp>

#include <cuda/functional>
#include <thrust/copy.h>
@@ -65,7 +66,7 @@ compute_point_keys_and_sorted_indices(PointIt points_first,
T scale,
int8_t max_depth,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr)
rmm::device_async_resource_ref mr)
{
auto num_points = thrust::distance(points_first, points_last);
rmm::device_uvector<uint32_t> keys(num_points, stream);
@@ -259,7 +260,7 @@ inline auto make_full_levels(PointIt points_first,
T scale,
int8_t max_depth,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr)
rmm::device_async_resource_ref mr)
{
auto num_points = thrust::distance(points_first, points_last);
// Compute point keys and sort into bottom-level quadrants
5 changes: 3 additions & 2 deletions cpp/include/cuspatial/detail/index/construction/phase_2.cuh
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2020-2021, NVIDIA CORPORATION.
* Copyright (c) 2020-2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -21,6 +21,7 @@
#include <rmm/cuda_stream_view.hpp>
#include <rmm/device_uvector.hpp>
#include <rmm/exec_policy.hpp>
#include <rmm/resource_ref.hpp>

#include <thrust/copy.h>
#include <thrust/count.h>
@@ -309,7 +310,7 @@ inline rmm::device_uvector<bool> construct_non_leaf_indicator(
int32_t num_parent_nodes,
int32_t num_valid_nodes,
int32_t max_size,
rmm::mr::device_memory_resource* mr,
rmm::device_async_resource_ref mr,
rmm::cuda_stream_view stream)
{
//
Original file line number Diff line number Diff line change
@@ -34,6 +34,7 @@
#include <rmm/exec_policy.hpp>
#include <rmm/mr/device/device_memory_resource.hpp>
#include <rmm/mr/device/per_device_resource.hpp>
#include <rmm/resource_ref.hpp>

#include <cuda/atomic>
#include <thrust/binary_search.h>
@@ -142,7 +143,7 @@ std::unique_ptr<rmm::device_uvector<types_t>> compute_types_buffer(
OffsetRangeB points_offset,
OffsetRangeB segments_offset,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr)
rmm::device_async_resource_ref mr)
{
auto types_buffer = std::make_unique<rmm::device_uvector<types_t>>(union_column_size, stream, mr);
thrust::tabulate(rmm::exec_policy(stream),
@@ -162,7 +163,7 @@
template <typename index_t, typename types_t>
std::unique_ptr<rmm::device_uvector<index_t>> compute_offset_buffer(
rmm::device_uvector<types_t> const& types_buffer,
rmm::mr::device_memory_resource* mr,
rmm::device_async_resource_ref mr,
rmm::cuda_stream_view stream)
{
auto N = types_buffer.size();
@@ -202,7 +203,7 @@ template <typename T,
linestring_intersection_result<T, index_t> pairwise_linestring_intersection(
MultiLinestringRange1 multilinestrings1,
MultiLinestringRange2 multilinestrings2,
rmm::mr::device_memory_resource* mr,
rmm::device_async_resource_ref mr,
rmm::cuda_stream_view stream)
{
using types_t = typename linestring_intersection_result<T, index_t>::types_t;
Original file line number Diff line number Diff line change
@@ -29,6 +29,7 @@
#include <rmm/device_uvector.hpp>
#include <rmm/exec_policy.hpp>
#include <rmm/mr/device/device_memory_resource.hpp>
#include <rmm/resource_ref.hpp>

#include <thrust/binary_search.h>
#include <thrust/distance.h>
@@ -228,7 +229,7 @@ struct linestring_intersection_intermediates {
/** @brief Construct a zero-pair, zero-geometry intermediate object
*/
linestring_intersection_intermediates(rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr)
rmm::device_async_resource_ref mr)
: offsets(std::make_unique<rmm::device_uvector<IndexType>>(1, stream)),
geoms(std::make_unique<rmm::device_uvector<GeomType>>(0, stream, mr)),
lhs_linestring_ids(std::make_unique<rmm::device_uvector<IndexType>>(0, stream)),
@@ -244,7 +245,7 @@
std::size_t num_geoms,
rmm::device_uvector<IndexType> const& num_geoms_per_pair,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr)
rmm::device_async_resource_ref mr)
: offsets(std::make_unique<rmm::device_uvector<IndexType>>(num_pairs + 1, stream)),
geoms(std::make_unique<rmm::device_uvector<GeomType>>(num_geoms, stream, mr)),
lhs_linestring_ids(std::make_unique<rmm::device_uvector<IndexType>>(num_geoms, stream)),
@@ -472,7 +473,7 @@ std::pair<linestring_intersection_intermediates<vec_2d<T>, index_t>,
linestring_intersection_intermediates<segment<T>, index_t>>
pairwise_linestring_intersection_with_duplicates(MultiLinestringRange1 multilinestrings1,
MultiLinestringRange2 multilinestrings2,
rmm::mr::device_memory_resource* mr,
rmm::device_async_resource_ref mr,
rmm::cuda_stream_view stream)
{
static_assert(std::is_integral_v<index_t>, "Index type must be integral.");
Original file line number Diff line number Diff line change
@@ -22,6 +22,8 @@
#include <cuspatial/point_quadtree.cuh>
#include <cuspatial/traits.hpp>

#include <rmm/resource_ref.hpp>

#include <cuda/functional>
#include <thrust/iterator/discard_iterator.h>

@@ -39,7 +41,7 @@ join_quadtree_and_bounding_boxes(point_quadtree_ref quadtree,
T scale,
int8_t max_depth,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr)
rmm::device_async_resource_ref mr)
{
static_assert(is_same<T, cuspatial::iterator_vec_base_type<BoundingBoxIterator>>(),
"Iterator value_type mismatch");
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2023, NVIDIA CORPORATION.
* Copyright (c) 2023-2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -26,6 +26,7 @@

#include <rmm/device_uvector.hpp>
#include <rmm/exec_policy.hpp>
#include <rmm/resource_ref.hpp>

#include <thrust/iterator/permutation_iterator.h>
#include <thrust/scan.h>
@@ -108,7 +109,7 @@ std::pair<rmm::device_uvector<IndexType>, rmm::device_uvector<IndexType>> quadtr
PointIterator points_first,
MultiPolygonRange polygons,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr)
rmm::device_async_resource_ref mr)
{
using T = iterator_vec_base_type<PointIterator>;

Original file line number Diff line number Diff line change
@@ -27,6 +27,7 @@

#include <rmm/device_uvector.hpp>
#include <rmm/exec_policy.hpp>
#include <rmm/resource_ref.hpp>

#include <cuda/functional>
#include <thrust/detail/raw_reference_cast.h>
@@ -156,7 +157,7 @@ quadtree_point_to_nearest_linestring(LinestringIndexIterator linestring_indices_
PointIterator points_first,
MultiLinestringRange linestrings,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr)
rmm::device_async_resource_ref mr)
{
CUSPATIAL_EXPECTS(linestrings.num_multilinestrings() == linestrings.num_linestrings(),
"Only one linestring per multilinestring currently supported.");
9 changes: 5 additions & 4 deletions cpp/include/cuspatial/detail/point_quadtree.cuh
@@ -24,6 +24,7 @@
#include <rmm/cuda_stream_view.hpp>
#include <rmm/device_uvector.hpp>
#include <rmm/mr/device/per_device_resource.hpp>
#include <rmm/resource_ref.hpp>

#include <cuda/functional>
#include <thrust/distance.h>
@@ -48,7 +49,7 @@ inline point_quadtree make_quad_tree(rmm::device_uvector<uint32_t>& keys,
int32_t max_size,
int32_t level_1_size,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr)
rmm::device_async_resource_ref mr)
{
// count the number of child nodes
auto num_child_nodes = thrust::reduce(rmm::exec_policy(stream),
@@ -155,7 +156,7 @@ inline point_quadtree make_leaf_tree(rmm::device_uvector<uint32_t>& keys,
rmm::device_uvector<uint32_t>& lengths,
int32_t num_top_quads,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr)
rmm::device_async_resource_ref mr)
{
rmm::device_uvector<uint8_t> levels(num_top_quads, stream, mr);
rmm::device_uvector<bool> is_internal_node(num_top_quads, stream, mr);
@@ -195,7 +196,7 @@ inline std::pair<rmm::device_uvector<uint32_t>, point_quadtree> construct_quadtr
T scale,
int8_t max_depth,
int32_t max_size,
rmm::mr::device_memory_resource* mr,
rmm::device_async_resource_ref mr,
rmm::cuda_stream_view stream)
{
// Construct the full set of non-empty subquadrants starting from the lowest level.
@@ -243,7 +244,7 @@ std::pair<rmm::device_uvector<uint32_t>, point_quadtree> quadtree_on_points(
int8_t max_depth,
int32_t max_size,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr)
rmm::device_async_resource_ref mr)
{
auto num_points = thrust::distance(points_first, points_last);
if (num_points <= 0) {
