Fix for CUDA codegen #1442

Merged 26 commits into master from edopao:bug-gpu-codegen on Dec 18, 2023
Changes from 22 commits
Commits (26, all by edopao):

9049210  [bug] Fix for floordiv codegen (Nov 21, 2023)
54bf6c1  [fix] Fix lowering to CUDA code (Nov 21, 2023)
f4a73f0  [bug] Missing symbols for data access on inter-state edge (Nov 17, 2023)
880f247  [test] Add gpu version of indirection test (Nov 22, 2023)
47856d8  [test] Add missing import for gpu test (Nov 22, 2023)
08a036e  [test] Minor edit (Nov 22, 2023)
9de8e60  [cuda] Revert SharedToGlobal1D to old codegen (Nov 22, 2023)
0e37b12  Remove extra changes (Nov 22, 2023)
87e947a  Different solution which keeps new template (Nov 23, 2023)
12b5321  Fix cuda codegen for 1D dynamic copy (Nov 23, 2023)
57f8c02  Merge remote-tracking branch 'origin/master' into bug-gpu-codegen (Nov 23, 2023)
54d0c67  Fix for broken test (Nov 24, 2023)
190f075  Merge remote-tracking branch 'origin/master' into bug-gpu-codegen-wip (Dec 5, 2023)
721cb32  Apply type-specialization to template for ifloor (Dec 5, 2023)
d53d25b  Address review comments (Dec 5, 2023)
fa967ea  Merge pull request #2 from edopao/bug-gpu-codegen-wip (Dec 5, 2023)
140aad4  Revert change for ifloor bugfix (Dec 5, 2023)
39ef8e8  Add test case for neighbor reduction (Dec 5, 2023)
289fba2  Merge pull request #3 from edopao/bug-gpu-codegen-wip (Dec 5, 2023)
12b3cdd  Replace init state with edge assignment (Dec 6, 2023)
150b4c4  Update test case for CUDA-codegen (#6) (Dec 11, 2023)
acf0e2d  Merge branch 'spcl:master' into bug-gpu-codegen (Dec 11, 2023)
d839089  Merge remote-tracking branch 'origin/master' into bug-gpu-codegen (Dec 12, 2023)
8b99971  Correction for is_async arg in gpu memory copies (Dec 12, 2023)
f5cd14b  Move __syncthreads after thread copy (Dec 12, 2023)
0dbaff2  Merge branch 'master' into bug-gpu-codegen (Dec 18, 2023)
16 changes: 14 additions & 2 deletions dace/codegen/targets/cuda.py
@@ -1132,10 +1132,22 @@ def _emit_copy(self, state_id, src_node, src_storage, dst_node, dst_storage, dst
func=funcname,
type=dst_node.desc(sdfg).dtype.ctype,
bdims=', '.join(_topy(self._block_dims)),
-                is_async='true' if state_dfg.out_degree(dst_node) > 0 else 'true',
+                is_async='true' if state_dfg.out_degree(dst_node) > 0 else 'false',
Collaborator comment:

It should be the other way around (if there is a dependent read after it in the same state, sync).

Author reply:
Correct. I did not pay enough attention to is_async before. Besides correcting the value of this argument, I have also moved the synchronization point in the template function after the thread-level copy (see my last commit on copy.cuh).
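
Note that, per this exchange, the predicate in the hunk above is still inverted; the flip lands in the later commit 8b99971 ("Correction for is_async arg in gpu memory copies"), which falls outside this 22-commit view. A minimal sketch of the agreed-on logic, assuming the same names used in _emit_copy:

    # Hedged sketch, not the merged code: a dependent read of dst_node in the
    # same state means the copy must synchronize, i.e. is_async must be false.
    is_async = 'false' if state_dfg.out_degree(dst_node) > 0 else 'true'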

accum=accum,
args=', '.join([src_expr] + _topy(src_strides) + [dst_expr] + custom_reduction +
_topy(dst_strides) + _topy(copy_shape))), sdfg, state_id, [src_node, dst_node])
elif funcname == 'dace::SharedToGlobal1D':
# special case: use a new template struct that provides functions for copy and reduction
callsite_stream.write(
(' {func}<{type}, {bdims}, {copysize}, {is_async}>{accum}({args});').format(
func=funcname,
type=dst_node.desc(sdfg).dtype.ctype,
bdims=', '.join(_topy(self._block_dims)),
copysize=', '.join(_topy(copy_shape)),
is_async='true' if state_dfg.out_degree(dst_node) > 0 else 'false',
accum=accum or '::Copy',
args=', '.join([src_expr] + _topy(src_strides) + [dst_expr] + _topy(dst_strides) + custom_reduction)), sdfg,
state_id, [src_node, dst_node])
else:
callsite_stream.write(
(' {func}<{type}, {bdims}, {copysize}, ' +
@@ -1145,7 +1157,7 @@ def _emit_copy(self, state_id, src_node, src_storage, dst_node, dst_storage, dst
bdims=', '.join(_topy(self._block_dims)),
copysize=', '.join(_topy(copy_shape)),
dststrides=', '.join(_topy(dst_strides)),
-                is_async='true' if state_dfg.out_degree(dst_node) > 0 else 'true',
+                is_async='true' if state_dfg.out_degree(dst_node) > 0 else 'false',
accum=accum,
args=', '.join([src_expr] + _topy(src_strides) + [dst_expr] + custom_reduction)), sdfg,
state_id, [src_node, dst_node])
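For illustration, a hypothetical instance of the call the new SharedToGlobal1D branch renders from the format string above, assuming a double-typed copy, a 32x1x1 thread block, copy shape 32, unit strides, and placeholder pointer expressions (none of these values come from a real SDFG):

    dace::SharedToGlobal1D<double, 32, 1, 1, 32, true>::Copy(smem, 1, gmem, 1);

When a write-conflict resolution is present, accum selects an Accum overload instead of the ::Copy default, which is what the new test below checks for with the pattern dace::SharedToGlobal1D<.+>::template Accum.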
45 changes: 31 additions & 14 deletions dace/runtime/include/dace/cuda/copy.cuh
@@ -736,58 +736,75 @@ namespace dace
int COPY_XLEN, bool ASYNC>
struct SharedToGlobal1D
{
static constexpr int BLOCK_SIZE = BLOCK_WIDTH * BLOCK_HEIGHT * BLOCK_DEPTH;
static constexpr int TOTAL = COPY_XLEN;
static constexpr int WRITES = TOTAL / BLOCK_SIZE;
static constexpr int REM_WRITES = TOTAL % BLOCK_SIZE;

static DACE_DFI void Copy(const T *smem, int src_xstride, T *ptr, int dst_xstride)
{
if (!ASYNC)
__syncthreads();

// Linear thread ID
int ltid = GetLinearTID<BLOCK_WIDTH, BLOCK_HEIGHT, BLOCK_DEPTH>();

#pragma unroll
for (int i = 0; i < WRITES; ++i) {
*(ptr + (ltid + i * BLOCK_SIZE) * dst_xstride) =
*(smem + (ltid + i * BLOCK_SIZE) * src_xstride);
}

if (REM_WRITES != 0 && ltid < REM_WRITES) {
*(ptr + (ltid + WRITES*BLOCK_SIZE)* dst_xstride) =
*(smem + (ltid + WRITES * BLOCK_SIZE) * src_xstride);
}
}

template <typename WCR>
-        static DACE_DFI void Accum(const T *smem, int src_xstride, T *ptr, int DST_XSTRIDE, WCR wcr)
+        static DACE_DFI void Accum(const T *smem, int src_xstride, T *ptr, int dst_xstride, WCR wcr)
{
if (!ASYNC)
__syncthreads();

// Linear thread ID
int ltid = GetLinearTID<BLOCK_WIDTH, BLOCK_HEIGHT, BLOCK_DEPTH>();
constexpr int BLOCK_SIZE = BLOCK_WIDTH * BLOCK_HEIGHT * BLOCK_DEPTH;
constexpr int TOTAL = COPY_XLEN;
constexpr int WRITES = TOTAL / BLOCK_SIZE;
constexpr int REM_WRITES = TOTAL % BLOCK_SIZE;

#pragma unroll
for (int i = 0; i < WRITES; ++i) {
wcr_custom<T>::template reduce(
-                    wcr, ptr + (ltid + i * BLOCK_SIZE) * DST_XSTRIDE,
+                    wcr, ptr + (ltid + i * BLOCK_SIZE) * dst_xstride,
*(smem + (ltid + i * BLOCK_SIZE) * src_xstride));
}

if (REM_WRITES != 0) {
if (ltid < REM_WRITES)
wcr_custom<T>::template reduce(
-                        ptr + (ltid + WRITES * BLOCK_SIZE)* DST_XSTRIDE,
+                        ptr + (ltid + WRITES * BLOCK_SIZE)* dst_xstride,
*(smem + (ltid + WRITES * BLOCK_SIZE) * src_xstride));
}
}

template <ReductionType REDTYPE>
-        static DACE_DFI void Accum(const T *smem, int src_xstride, T *ptr, int DST_XSTRIDE)
+        static DACE_DFI void Accum(const T *smem, int src_xstride, T *ptr, int dst_xstride)
{
if (!ASYNC)
__syncthreads();

// Linear thread ID
int ltid = GetLinearTID<BLOCK_WIDTH, BLOCK_HEIGHT, BLOCK_DEPTH>();
constexpr int BLOCK_SIZE = BLOCK_WIDTH * BLOCK_HEIGHT * BLOCK_DEPTH;
constexpr int TOTAL = COPY_XLEN;
constexpr int WRITES = TOTAL / BLOCK_SIZE;
constexpr int REM_WRITES = TOTAL % BLOCK_SIZE;

#pragma unroll
for (int i = 0; i < WRITES; ++i) {
wcr_fixed<REDTYPE, T>::template reduce_atomic(
-                    ptr + (ltid + i * BLOCK_SIZE) * DST_XSTRIDE,
+                    ptr + (ltid + i * BLOCK_SIZE) * dst_xstride,
*(smem + (ltid + i * BLOCK_SIZE) * src_xstride));
}

if (REM_WRITES != 0) {
if (ltid < REM_WRITES)
wcr_fixed<REDTYPE, T>::template reduce_atomic(
-                        ptr + (ltid + WRITES*BLOCK_SIZE)* DST_XSTRIDE,
+                        ptr + (ltid + WRITES*BLOCK_SIZE)* dst_xstride,
*(smem + (ltid + WRITES * BLOCK_SIZE) * src_xstride));
}
}
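As a quick sanity check of the tiling arithmetic behind these constants (illustrative numbers, not from the PR): with a 32x1x1 thread block copying COPY_XLEN = 100 elements, BLOCK_SIZE = 32, WRITES = 100 / 32 = 3, and REM_WRITES = 100 % 32 = 4, so each thread issues three strided writes and threads 0 through 3 cover the remaining four elements.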
84 changes: 84 additions & 0 deletions tests/codegen/cuda_memcopy_test.py
@@ -0,0 +1,84 @@
""" Tests code generation for array copy on GPU target. """
import dace
from dace.transformation.auto import auto_optimize

import pytest
import re

# this test requires cupy module
cp = pytest.importorskip("cupy")

# initialize random number generator
rng = cp.random.default_rng(42)


@pytest.mark.gpu
def test_gpu_shared_to_global_1D():
M = 32
N = dace.symbol('N')

@dace.program
def transpose_shared_to_global(A: dace.float64[M, N], B: dace.float64[N, M]):
for i in dace.map[0:N]:
local_gather = dace.define_local([M], A.dtype, storage=dace.StorageType.GPU_Shared)
for j in dace.map[0:M]:
local_gather[j] = A[j, i]
B[i, :] = local_gather


sdfg = transpose_shared_to_global.to_sdfg()
auto_optimize.apply_gpu_storage(sdfg)

size_M = M
size_N = 128

A = rng.random((size_M, size_N,))
B = rng.random((size_N, size_M,))

ref = A.transpose()

sdfg(A, B, N=size_N)
    assert cp.allclose(ref, B)

code = sdfg.generate_code()[1].clean_code # Get GPU code (second file)
m = re.search('dace::SharedToGlobal1D<.+>::Copy', code)
assert m is not None


@pytest.mark.gpu
def test_gpu_shared_to_global_1D_accumulate():
M = 32
N = dace.symbol('N')

@dace.program
def transpose_and_add_shared_to_global(A: dace.float64[M, N], B: dace.float64[N, M]):
for i in dace.map[0:N]:
local_gather = dace.define_local([M], A.dtype, storage=dace.StorageType.GPU_Shared)
for j in dace.map[0:M]:
local_gather[j] = A[j, i]
local_gather[:] >> B(M, lambda x, y: x + y)[i, :]


sdfg = transpose_and_add_shared_to_global.to_sdfg()
auto_optimize.apply_gpu_storage(sdfg)

size_M = M
size_N = 128

A = rng.random((size_M, size_N,))
B = rng.random((size_N, size_M,))

ref = A.transpose() + B

sdfg(A, B, N=size_N)
    assert cp.allclose(ref, B)

code = sdfg.generate_code()[1].clean_code # Get GPU code (second file)
m = re.search('dace::SharedToGlobal1D<.+>::template Accum', code)
assert m is not None


if __name__ == '__main__':
test_gpu_shared_to_global_1D()
test_gpu_shared_to_global_1D_accumulate()
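
These tests can also be run on their own via the gpu marker, e.g. pytest -m gpu tests/codegen/cuda_memcopy_test.py, assuming cupy and a CUDA-capable device are available.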
