Fix for CUDA codegen #1442

Merged 26 commits into master from edopao:bug-gpu-codegen on Dec 18, 2023
Changes from 22 commits
Commits (26, all by edopao):

9049210  [bug] Fix for floordiv codegen (Nov 21, 2023)
54bf6c1  [fix] Fix lowering to CUDA code (Nov 21, 2023)
f4a73f0  [bug] Missing symbols for data access on inter-state edge (Nov 17, 2023)
880f247  [test] Add gpu version of indirection test (Nov 22, 2023)
47856d8  [test] Add missing import for gpu test (Nov 22, 2023)
08a036e  [test] Minor edit (Nov 22, 2023)
9de8e60  [cuda] Revert SharedToGlobal1D to old codegen (Nov 22, 2023)
0e37b12  Remove extra changes (Nov 22, 2023)
87e947a  Different solution which keeps new template (Nov 23, 2023)
12b5321  Fix cuda codegen for 1D dynamic copy (Nov 23, 2023)
57f8c02  Merge remote-tracking branch 'origin/master' into bug-gpu-codegen (Nov 23, 2023)
54d0c67  Fix for broken test (Nov 24, 2023)
190f075  Merge remote-tracking branch 'origin/master' into bug-gpu-codegen-wip (Dec 5, 2023)
721cb32  Apply type-specialization to template for ifloor (Dec 5, 2023)
d53d25b  Address review comments (Dec 5, 2023)
fa967ea  Merge pull request #2 from edopao/bug-gpu-codegen-wip (Dec 5, 2023)
140aad4  Revert change for ifloor bugfix (Dec 5, 2023)
39ef8e8  Add test case for neighbor reduction (Dec 5, 2023)
289fba2  Merge pull request #3 from edopao/bug-gpu-codegen-wip (Dec 5, 2023)
12b3cdd  Replace init state with edge assignment (Dec 6, 2023)
150b4c4  Update test case for CUDA-codegen (#6) (Dec 11, 2023)
acf0e2d  Merge branch 'spcl:master' into bug-gpu-codegen (Dec 11, 2023)
d839089  Merge remote-tracking branch 'origin/master' into bug-gpu-codegen (Dec 12, 2023)
8b99971  Correction for is_async arg in gpu memory copies (Dec 12, 2023)
f5cd14b  Move __syncthreads after thread copy (Dec 12, 2023)
0dbaff2  Merge branch 'master' into bug-gpu-codegen (Dec 18, 2023)
16 changes: 14 additions & 2 deletions dace/codegen/targets/cuda.py
@@ -1132,10 +1132,22 @@ def _emit_copy(self, state_id, src_node, src_storage, dst_node, dst_storage, dst
func=funcname,
type=dst_node.desc(sdfg).dtype.ctype,
bdims=', '.join(_topy(self._block_dims)),
-                is_async='true' if state_dfg.out_degree(dst_node) > 0 else 'true',
+                is_async='true' if state_dfg.out_degree(dst_node) > 0 else 'false',
Collaborator comment:

It should be the other way around (if there is a dependent read after it in the same state, sync).

Author reply:
Correct. I did not pay enough attention to is_async before. Besides correcting the value of this argument, I have also moved the synchronization point in the template function after the thread-level copy (see my last commit on copy.cuh).
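
Note that, per this exchange, the predicate in the hunk above is still inverted; the flip lands in the later commit 8b99971 ("Correction for is_async arg in gpu memory copies"), which falls outside this 22-commit view. A minimal sketch of the agreed-on logic, assuming the same names used in _emit_copy:

    # Hedged sketch, not the merged code: a dependent read of dst_node in the
    # same state means the copy must synchronize, i.e. is_async must be false.
    is_async = 'false' if state_dfg.out_degree(dst_node) > 0 else 'true'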

accum=accum,
args=', '.join([src_expr] + _topy(src_strides) + [dst_expr] + custom_reduction +
_topy(dst_strides) + _topy(copy_shape))), sdfg, state_id, [src_node, dst_node])
elif funcname == 'dace::SharedToGlobal1D':
# special case: use a new template struct that provides functions for copy and reduction
callsite_stream.write(
(' {func}<{type}, {bdims}, {copysize}, {is_async}>{accum}({args});').format(
func=funcname,
type=dst_node.desc(sdfg).dtype.ctype,
bdims=', '.join(_topy(self._block_dims)),
copysize=', '.join(_topy(copy_shape)),
is_async='true' if state_dfg.out_degree(dst_node) > 0 else 'false',
accum=accum or '::Copy',
args=', '.join([src_expr] + _topy(src_strides) + [dst_expr] + _topy(dst_strides) + custom_reduction)), sdfg,
state_id, [src_node, dst_node])
else:
callsite_stream.write(
(' {func}<{type}, {bdims}, {copysize}, ' +
@@ -1145,7 +1157,7 @@ def _emit_copy(self, state_id, src_node, src_storage, dst_node, dst_storage, dst
bdims=', '.join(_topy(self._block_dims)),
copysize=', '.join(_topy(copy_shape)),
dststrides=', '.join(_topy(dst_strides)),
-                is_async='true' if state_dfg.out_degree(dst_node) > 0 else 'true',
+                is_async='true' if state_dfg.out_degree(dst_node) > 0 else 'false',
accum=accum,
args=', '.join([src_expr] + _topy(src_strides) + [dst_expr] + custom_reduction)), sdfg,
state_id, [src_node, dst_node])
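For illustration, a hypothetical instance of the call the new SharedToGlobal1D branch renders from the format string above, assuming a double-typed copy, a 32x1x1 thread block, copy shape 32, unit strides, and placeholder pointer expressions (none of these values come from a real SDFG):

    dace::SharedToGlobal1D<double, 32, 1, 1, 32, true>::Copy(smem, 1, gmem, 1);

When a write-conflict resolution is present, accum selects an Accum overload instead of the ::Copy default, which is what the new test below checks for with the pattern dace::SharedToGlobal1D<.+>::template Accum.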
45 changes: 31 additions & 14 deletions dace/runtime/include/dace/cuda/copy.cuh
@@ -736,58 +736,75 @@ namespace dace
int COPY_XLEN, bool ASYNC>
struct SharedToGlobal1D
{
static constexpr int BLOCK_SIZE = BLOCK_WIDTH * BLOCK_HEIGHT * BLOCK_DEPTH;
static constexpr int TOTAL = COPY_XLEN;
static constexpr int WRITES = TOTAL / BLOCK_SIZE;
static constexpr int REM_WRITES = TOTAL % BLOCK_SIZE;

static DACE_DFI void Copy(const T *smem, int src_xstride, T *ptr, int dst_xstride)
{
if (!ASYNC)
__syncthreads();

// Linear thread ID
int ltid = GetLinearTID<BLOCK_WIDTH, BLOCK_HEIGHT, BLOCK_DEPTH>();

#pragma unroll
for (int i = 0; i < WRITES; ++i) {
*(ptr + (ltid + i * BLOCK_SIZE) * dst_xstride) =
*(smem + (ltid + i * BLOCK_SIZE) * src_xstride);
}

if (REM_WRITES != 0 && ltid < REM_WRITES) {
*(ptr + (ltid + WRITES*BLOCK_SIZE)* dst_xstride) =
*(smem + (ltid + WRITES * BLOCK_SIZE) * src_xstride);
}
}

template <typename WCR>
-        static DACE_DFI void Accum(const T *smem, int src_xstride, T *ptr, int DST_XSTRIDE, WCR wcr)
+        static DACE_DFI void Accum(const T *smem, int src_xstride, T *ptr, int dst_xstride, WCR wcr)
{
if (!ASYNC)
__syncthreads();

// Linear thread ID
int ltid = GetLinearTID<BLOCK_WIDTH, BLOCK_HEIGHT, BLOCK_DEPTH>();
constexpr int BLOCK_SIZE = BLOCK_WIDTH * BLOCK_HEIGHT * BLOCK_DEPTH;
constexpr int TOTAL = COPY_XLEN;
constexpr int WRITES = TOTAL / BLOCK_SIZE;
constexpr int REM_WRITES = TOTAL % BLOCK_SIZE;

#pragma unroll
for (int i = 0; i < WRITES; ++i) {
wcr_custom<T>::template reduce(
-                    wcr, ptr + (ltid + i * BLOCK_SIZE) * DST_XSTRIDE,
+                    wcr, ptr + (ltid + i * BLOCK_SIZE) * dst_xstride,
*(smem + (ltid + i * BLOCK_SIZE) * src_xstride));
}

if (REM_WRITES != 0) {
if (ltid < REM_WRITES)
wcr_custom<T>::template reduce(
-                        ptr + (ltid + WRITES * BLOCK_SIZE)* DST_XSTRIDE,
+                        ptr + (ltid + WRITES * BLOCK_SIZE)* dst_xstride,
*(smem + (ltid + WRITES * BLOCK_SIZE) * src_xstride));
}
}

template <ReductionType REDTYPE>
-        static DACE_DFI void Accum(const T *smem, int src_xstride, T *ptr, int DST_XSTRIDE)
+        static DACE_DFI void Accum(const T *smem, int src_xstride, T *ptr, int dst_xstride)
{
if (!ASYNC)
__syncthreads();

// Linear thread ID
int ltid = GetLinearTID<BLOCK_WIDTH, BLOCK_HEIGHT, BLOCK_DEPTH>();
constexpr int BLOCK_SIZE = BLOCK_WIDTH * BLOCK_HEIGHT * BLOCK_DEPTH;
constexpr int TOTAL = COPY_XLEN;
constexpr int WRITES = TOTAL / BLOCK_SIZE;
constexpr int REM_WRITES = TOTAL % BLOCK_SIZE;

#pragma unroll
for (int i = 0; i < WRITES; ++i) {
wcr_fixed<REDTYPE, T>::template reduce_atomic(
-                    ptr + (ltid + i * BLOCK_SIZE) * DST_XSTRIDE,
+                    ptr + (ltid + i * BLOCK_SIZE) * dst_xstride,
*(smem + (ltid + i * BLOCK_SIZE) * src_xstride));
}

if (REM_WRITES != 0) {
if (ltid < REM_WRITES)
wcr_fixed<REDTYPE, T>::template reduce_atomic(
-                        ptr + (ltid + WRITES*BLOCK_SIZE)* DST_XSTRIDE,
+                        ptr + (ltid + WRITES*BLOCK_SIZE)* dst_xstride,
*(smem + (ltid + WRITES * BLOCK_SIZE) * src_xstride));
}
}
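As a quick sanity check of the tiling arithmetic behind these constants (illustrative numbers, not from the PR): with a 32x1x1 thread block copying COPY_XLEN = 100 elements, BLOCK_SIZE = 32, WRITES = 100 / 32 = 3, and REM_WRITES = 100 % 32 = 4, so each thread issues three strided writes and threads 0 through 3 cover the remaining four elements.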
84 changes: 84 additions & 0 deletions tests/codegen/cuda_memcopy_test.py
@@ -0,0 +1,84 @@
""" Tests code generation for array copy on GPU target. """
import dace
from dace.transformation.auto import auto_optimize

import pytest
import re

# this test requires cupy module
cp = pytest.importorskip("cupy")

# initialize random number generator
rng = cp.random.default_rng(42)


@pytest.mark.gpu
def test_gpu_shared_to_global_1D():
M = 32
N = dace.symbol('N')

@dace.program
def transpose_shared_to_global(A: dace.float64[M, N], B: dace.float64[N, M]):
for i in dace.map[0:N]:
local_gather = dace.define_local([M], A.dtype, storage=dace.StorageType.GPU_Shared)
for j in dace.map[0:M]:
local_gather[j] = A[j, i]
B[i, :] = local_gather


sdfg = transpose_shared_to_global.to_sdfg()
auto_optimize.apply_gpu_storage(sdfg)

size_M = M
size_N = 128

A = rng.random((size_M, size_N,))
B = rng.random((size_N, size_M,))

ref = A.transpose()

sdfg(A, B, N=size_N)
    assert cp.allclose(ref, B)

code = sdfg.generate_code()[1].clean_code # Get GPU code (second file)
m = re.search('dace::SharedToGlobal1D<.+>::Copy', code)
assert m is not None


@pytest.mark.gpu
def test_gpu_shared_to_global_1D_accumulate():
M = 32
N = dace.symbol('N')

@dace.program
def transpose_and_add_shared_to_global(A: dace.float64[M, N], B: dace.float64[N, M]):
for i in dace.map[0:N]:
local_gather = dace.define_local([M], A.dtype, storage=dace.StorageType.GPU_Shared)
for j in dace.map[0:M]:
local_gather[j] = A[j, i]
local_gather[:] >> B(M, lambda x, y: x + y)[i, :]


sdfg = transpose_and_add_shared_to_global.to_sdfg()
auto_optimize.apply_gpu_storage(sdfg)

size_M = M
size_N = 128

A = rng.random((size_M, size_N,))
B = rng.random((size_N, size_M,))

ref = A.transpose() + B

sdfg(A, B, N=size_N)
    assert cp.allclose(ref, B)

code = sdfg.generate_code()[1].clean_code # Get GPU code (second file)
m = re.search('dace::SharedToGlobal1D<.+>::template Accum', code)
assert m is not None


if __name__ == '__main__':
test_gpu_shared_to_global_1D()
test_gpu_shared_to_global_1D_accumulate()
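
These tests can also be run on their own via the gpu marker, e.g. pytest -m gpu tests/codegen/cuda_memcopy_test.py, assuming cupy and a CUDA-capable device are available.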
