DCAFitterGPU: reduce I/O overhead by copying elements using a kernel (A…
mconcas authored Oct 3, 2024
1 parent 8957bd6 · commit c0c70ae
Showing 2 changed files with 13 additions and 4 deletions.
13 changes: 11 additions & 2 deletions Common/DCAFitter/GPU/cuda/DCAFitterN.cu
@@ -56,14 +56,22 @@ GPUg() void printKernel(Fitter* fitter)
   }
 }
 
+template <typename Fitter>
+GPUg() void initFitters(Fitter* fitters, unsigned int off, unsigned int N)
+{
+  for (auto iThread{blockIdx.x * blockDim.x + threadIdx.x + 1}; iThread < N; iThread += blockDim.x * gridDim.x) {
+    fitters[iThread + off] = fitters[off];
+  }
+}
+
 template <typename Fitter, typename... Tr>
 GPUg() void processKernel(Fitter* fitter, int* res, Tr*... tracks)
 {
   *res = fitter->process(*tracks...);
 }
 
 template <typename Fitter, typename... Tr>
-GPUg() void processBatchKernel(Fitter* fitters, int* results, size_t off, size_t N, Tr*... tracks)
+GPUg() void processBatchKernel(Fitter* fitters, int* results, unsigned int off, unsigned int N, Tr*... tracks)
 {
   for (auto iThread{blockIdx.x * blockDim.x + threadIdx.x}; iThread < N; iThread += blockDim.x * gridDim.x) {
     results[iThread + off] = fitters[iThread + off].process(tracks[iThread + off]...);
@@ -186,7 +194,7 @@ void processBulk(const int nBlocks,
     auto nFits = batchSize + (iBatch < remainder ? 1 : 0);
     gpuCheckError(cudaEventRecord(startIOUp[iBatch], stream));
-    gpuCheckError(cudaMemcpyAsync(fitters_device + offset, fitters.data() + offset, sizeof(Fitter) * nFits, cudaMemcpyHostToDevice, stream));
+    gpuCheckError(cudaMemcpyAsync(fitters_device + offset, fitters.data() + offset, sizeof(Fitter) /* * nFits */, cudaMemcpyHostToDevice, stream)); // copying just the first element of the buffer
     iArg = 0;
     ([&] {
       gpuCheckError(cudaMemcpyAsync(tracks_device[iArg] + offset, args.data() + offset, sizeof(Tr) * nFits, cudaMemcpyHostToDevice, stream));
@@ -196,6 +204,7 @@ void processBulk(const int nBlocks,
     gpuCheckError(cudaEventRecord(endIOUp[iBatch], stream));
     gpuCheckError(cudaEventRecord(startKer[iBatch], stream));
+    kernel::initFitters<<<nBlocks, nThreads, 0, stream>>>(fitters_device, offset, nFits);
     std::apply([&](auto&&... args) { kernel::processBatchKernel<<<nBlocks, nThreads, 0, stream>>>(fitters_device, results_device, offset, nFits, args...); }, tracks_device);
     gpuCheckError(cudaEventRecord(endKer[iBatch], stream));
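What the change does: instead of streaming nFits identical Fitter objects host-to-device in every batch, only the first element is uploaded and the new initFitters kernel broadcasts it across the rest of the batch on the device, replacing most of the PCIe traffic with device-memory writes. Note the +1 in the thread index: slot off already holds the host copy, so replication starts one past it. Below is a minimal standalone sketch of the same pattern, with an illustrative Config struct standing in for the Fitter (names, sizes, and the check helper are assumptions, not from the O2 sources):

#include <cstdio>
#include <cstdlib>

// Plain-old-data stand-in for the Fitter configuration.
struct Config {
  float bz;
  int maxIter;
};

static void check(cudaError_t err)
{
  if (err != cudaSuccess) {
    std::printf("CUDA error: %s\n", cudaGetErrorString(err));
    std::exit(1);
  }
}

// Grid-stride broadcast: the index starts at +1 because slot `off`
// already holds the element copied from the host.
__global__ void replicate(Config* buf, unsigned int off, unsigned int n)
{
  for (auto i = blockIdx.x * blockDim.x + threadIdx.x + 1; i < n; i += blockDim.x * gridDim.x) {
    buf[i + off] = buf[off];
  }
}

int main()
{
  const unsigned int n = 1 << 20;
  Config host{5.f, 4};

  Config* dev;
  check(cudaMalloc(&dev, sizeof(Config) * n));
  // One tiny H2D copy instead of n elements' worth of bus traffic...
  check(cudaMemcpy(dev, &host, sizeof(Config), cudaMemcpyHostToDevice));
  // ...then fan it out at device-memory bandwidth.
  replicate<<<30, 256>>>(dev, 0, n);
  check(cudaDeviceSynchronize());
  check(cudaFree(dev));
  return 0;
}

The trick relies on every fitter in a batch starting from the same state: fitters[iThread + off] = fitters[off] is a broadcast, not a general copy, which is why uploading a single element suffices.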
4 changes: 2 additions & 2 deletions Common/DCAFitter/GPU/cuda/test/testDCAFitterNGPU.cxx
@@ -560,8 +560,8 @@ BOOST_AUTO_TEST_CASE(DCAFitterNProngsBulk)
   const char* nBlocksEnvVarName = "DCAFITTERGPU_TEST_NBLOCKS";
   const char* nBatchesEnvVarName = "DCAFITTERGPU_TEST_NBATCHES";
   const char* nTestsEnvVarName = "DCAFITTERGPU_TEST_NTESTS";
-  int nBlocks = std::getenv(nThreadsEnvVarName) == nullptr ? 30 : std::stoi(std::getenv(nThreadsEnvVarName));
-  int nThreads = std::getenv(nBlocksEnvVarName) == nullptr ? 256 : std::stoi(std::getenv(nBlocksEnvVarName));
+  int nBlocks = std::getenv(nBlocksEnvVarName) == nullptr ? 30 : std::stoi(std::getenv(nBlocksEnvVarName));
+  int nThreads = std::getenv(nThreadsEnvVarName) == nullptr ? 256 : std::stoi(std::getenv(nThreadsEnvVarName));
   int nBatches = std::getenv(nBatchesEnvVarName) == nullptr ? 8 : std::stoi(std::getenv(nBatchesEnvVarName));
   int NTest = std::getenv(nTestsEnvVarName) == nullptr ? 100001 : std::stoi(std::getenv(nTestsEnvVarName));
 
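The test fix is a straight swap: nBlocks had been read from nThreadsEnvVarName and nThreads from nBlocksEnvVarName, so overriding either environment variable changed the other launch parameter. One way to keep a name and its target paired at a single call site is a small lookup helper; a sketch under that assumption (envOr is hypothetical and not part of the test, and only the variable names visible in the hunk are used):

#include <cstdio>
#include <cstdlib>
#include <string>

// Hypothetical helper: read an integer environment variable or fall
// back to a default, so each parameter names its variable exactly once.
static int envOr(const char* name, int fallback)
{
  const char* v = std::getenv(name);
  return v == nullptr ? fallback : std::stoi(v);
}

int main()
{
  int nBlocks = envOr("DCAFITTERGPU_TEST_NBLOCKS", 30);
  int nBatches = envOr("DCAFITTERGPU_TEST_NBATCHES", 8);
  int nTests = envOr("DCAFITTERGPU_TEST_NTESTS", 100001);
  std::printf("blocks=%d batches=%d tests=%d\n", nBlocks, nBatches, nTests);
  return 0;
}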
