DCAFitterGPU: reduce I/O overhead by copying elements using a kernel (A…
mconcas authored Oct 3, 2024
1 parent 8957bd6 · commit c0c70ae
Showing 2 changed files with 13 additions and 4 deletions.
13 changes: 11 additions & 2 deletions Common/DCAFitter/GPU/cuda/DCAFitterN.cu
@@ -56,14 +56,22 @@ GPUg() void printKernel(Fitter* fitter)
   }
 }
 
+template <typename Fitter>
+GPUg() void initFitters(Fitter* fitters, unsigned int off, unsigned int N)
+{
+  for (auto iThread{blockIdx.x * blockDim.x + threadIdx.x + 1}; iThread < N; iThread += blockDim.x * gridDim.x) {
+    fitters[iThread + off] = fitters[off];
+  }
+}
+
 template <typename Fitter, typename... Tr>
 GPUg() void processKernel(Fitter* fitter, int* res, Tr*... tracks)
 {
   *res = fitter->process(*tracks...);
 }
 
 template <typename Fitter, typename... Tr>
-GPUg() void processBatchKernel(Fitter* fitters, int* results, size_t off, size_t N, Tr*... tracks)
+GPUg() void processBatchKernel(Fitter* fitters, int* results, unsigned int off, unsigned int N, Tr*... tracks)
 {
   for (auto iThread{blockIdx.x * blockDim.x + threadIdx.x}; iThread < N; iThread += blockDim.x * gridDim.x) {
     results[iThread + off] = fitters[iThread + off].process(tracks[iThread + off]...);
@@ -186,7 +194,7 @@ void processBulk(const int nBlocks,
     auto nFits = batchSize + (iBatch < remainder ? 1 : 0);
     gpuCheckError(cudaEventRecord(startIOUp[iBatch], stream));
-    gpuCheckError(cudaMemcpyAsync(fitters_device + offset, fitters.data() + offset, sizeof(Fitter) * nFits, cudaMemcpyHostToDevice, stream));
+    gpuCheckError(cudaMemcpyAsync(fitters_device + offset, fitters.data() + offset, sizeof(Fitter) /* * nFits */, cudaMemcpyHostToDevice, stream)); // copying just the first element of the buffer
     iArg = 0;
     ([&] {
       gpuCheckError(cudaMemcpyAsync(tracks_device[iArg] + offset, args.data() + offset, sizeof(Tr) * nFits, cudaMemcpyHostToDevice, stream));
@@ -196,6 +204,7 @@ void processBulk(const int nBlocks,
     gpuCheckError(cudaEventRecord(endIOUp[iBatch], stream));
     gpuCheckError(cudaEventRecord(startKer[iBatch], stream));
+    kernel::initFitters<<<nBlocks, nThreads, 0, stream>>>(fitters_device, offset, nFits);
     std::apply([&](auto&&... args) { kernel::processBatchKernel<<<nBlocks, nThreads, 0, stream>>>(fitters_device, results_device, offset, nFits, args...); }, tracks_device);
     gpuCheckError(cudaEventRecord(endKer[iBatch], stream));
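What the change does: instead of streaming nFits identical Fitter objects host-to-device in every batch, only the first element is uploaded and the new initFitters kernel broadcasts it across the rest of the batch on the device, replacing most of the PCIe traffic with device-memory writes. Note the +1 in the thread index: slot off already holds the host copy, so replication starts one past it. Below is a minimal standalone sketch of the same pattern, with an illustrative Config struct standing in for the Fitter (names, sizes, and the check helper are assumptions, not from the O2 sources):

#include <cstdio>
#include <cstdlib>

// Plain-old-data stand-in for the Fitter configuration.
struct Config {
  float bz;
  int maxIter;
};

static void check(cudaError_t err)
{
  if (err != cudaSuccess) {
    std::printf("CUDA error: %s\n", cudaGetErrorString(err));
    std::exit(1);
  }
}

// Grid-stride broadcast: the index starts at +1 because slot `off`
// already holds the element copied from the host.
__global__ void replicate(Config* buf, unsigned int off, unsigned int n)
{
  for (auto i = blockIdx.x * blockDim.x + threadIdx.x + 1; i < n; i += blockDim.x * gridDim.x) {
    buf[i + off] = buf[off];
  }
}

int main()
{
  const unsigned int n = 1 << 20;
  Config host{5.f, 4};

  Config* dev;
  check(cudaMalloc(&dev, sizeof(Config) * n));
  // One tiny H2D copy instead of n elements' worth of bus traffic...
  check(cudaMemcpy(dev, &host, sizeof(Config), cudaMemcpyHostToDevice));
  // ...then fan it out at device-memory bandwidth.
  replicate<<<30, 256>>>(dev, 0, n);
  check(cudaDeviceSynchronize());
  check(cudaFree(dev));
  return 0;
}

The trick relies on every fitter in a batch starting from the same state: fitters[iThread + off] = fitters[off] is a broadcast, not a general copy, which is why uploading a single element suffices.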
4 changes: 2 additions & 2 deletions Common/DCAFitter/GPU/cuda/test/testDCAFitterNGPU.cxx
@@ -560,8 +560,8 @@ BOOST_AUTO_TEST_CASE(DCAFitterNProngsBulk)
   const char* nBlocksEnvVarName = "DCAFITTERGPU_TEST_NBLOCKS";
   const char* nBatchesEnvVarName = "DCAFITTERGPU_TEST_NBATCHES";
   const char* nTestsEnvVarName = "DCAFITTERGPU_TEST_NTESTS";
-  int nBlocks = std::getenv(nThreadsEnvVarName) == nullptr ? 30 : std::stoi(std::getenv(nThreadsEnvVarName));
-  int nThreads = std::getenv(nBlocksEnvVarName) == nullptr ? 256 : std::stoi(std::getenv(nBlocksEnvVarName));
+  int nBlocks = std::getenv(nBlocksEnvVarName) == nullptr ? 30 : std::stoi(std::getenv(nBlocksEnvVarName));
+  int nThreads = std::getenv(nThreadsEnvVarName) == nullptr ? 256 : std::stoi(std::getenv(nThreadsEnvVarName));
   int nBatches = std::getenv(nBatchesEnvVarName) == nullptr ? 8 : std::stoi(std::getenv(nBatchesEnvVarName));
   int NTest = std::getenv(nTestsEnvVarName) == nullptr ? 100001 : std::stoi(std::getenv(nTestsEnvVarName));
 
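The test fix is a straight swap: nBlocks had been read from nThreadsEnvVarName and nThreads from nBlocksEnvVarName, so overriding either environment variable changed the other launch parameter. One way to keep a name and its target paired at a single call site is a small lookup helper; a sketch under that assumption (envOr is hypothetical and not part of the test, and only the variable names visible in the hunk are used):

#include <cstdio>
#include <cstdlib>
#include <string>

// Hypothetical helper: read an integer environment variable or fall
// back to a default, so each parameter names its variable exactly once.
static int envOr(const char* name, int fallback)
{
  const char* v = std::getenv(name);
  return v == nullptr ? fallback : std::stoi(v);
}

int main()
{
  int nBlocks = envOr("DCAFITTERGPU_TEST_NBLOCKS", 30);
  int nBatches = envOr("DCAFITTERGPU_TEST_NBATCHES", 8);
  int nTests = envOr("DCAFITTERGPU_TEST_NTESTS", 100001);
  std::printf("blocks=%d batches=%d tests=%d\n", nBlocks, nBatches, nTests);
  return 0;
}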
