Add decimal64 div CUDA test
mborland committed Aug 13, 2024
1 parent b48bbb2 commit 62e5473
Showing 2 changed files with 120 additions and 0 deletions.
1 change: 1 addition & 0 deletions test/cuda_jamfile
@@ -21,6 +21,7 @@ run test_decimal32_div.cu ;
run test_decimal64_add.cu ;
run test_decimal64_sub.cu ;
run test_decimal64_mul.cu ;
run test_decimal64_div.cu ;

# Fast Types

119 changes: 119 additions & 0 deletions test/test_decimal64_div.cu
@@ -0,0 +1,119 @@

// Copyright John Maddock 2016.
// Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)

#pragma nv_diag_suppress 186

#include <iostream>
#include <iomanip>
#include <vector>
#include <random>
#include <boost/decimal.hpp>
#include "cuda_managed_ptr.hpp"
#include "stopwatch.hpp"

// For the CUDA runtime routines (prefixed with "cuda_")
#include <cuda_runtime.h>

using float_type = boost::decimal::decimal64;

/**
* CUDA Kernel Device code
*
*/
__global__ void cuda_test(const float_type* in1, const float_type* in2, float_type *out, int numElements)
{
// Global index of this thread; each thread performs one division
int i = blockDim.x * blockIdx.x + threadIdx.x;

if (i < numElements)
{
out[i] = in1[i] / in2[i];
}
}

/**
* Host main routine
*/
int main(void)
{
using namespace boost::decimal;

// Error code to check return values for CUDA calls
cudaError_t err = cudaSuccess;

// Print the vector length to be used, and compute its size
int numElements = 50000;
std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;

// Allocate the managed input vector A
cuda_managed_ptr<float_type> input_vector1(numElements);

// Allocate the managed input vector B
cuda_managed_ptr<float_type> input_vector2(numElements);

// Allocate the managed output vector C
cuda_managed_ptr<float_type> output_vector(numElements);

// Initialize the input vectors
std::mt19937_64 rng(42);
std::uniform_int_distribution<int> dist(-1000, 1000);
for (int i = 0; i < numElements; ++i)
{
input_vector1[i] = static_cast<float_type>(dist(rng));
input_vector2[i] = static_cast<float_type>(dist(rng));
}

// Launch the vector division CUDA kernel
int threadsPerBlock = 1024;
int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; // round up so every element gets a thread
std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;

watch w;

cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector1.get(), input_vector2.get(), output_vector.get(), numElements);
cudaDeviceSynchronize();

std::cout << "CUDA kernal done in: " << w.elapsed() << "s" << std::endl;

err = cudaGetLastError();

if (err != cudaSuccess)
{
std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
return EXIT_FAILURE;
}

// Verify that the result vector is correct
std::vector<float_type> results;
results.reserve(numElements);
w.reset();
for(int i = 0; i < numElements; ++i)
{
results.push_back(input_vector1[i] / input_vector2[i]);
}
double t = w.elapsed();
// check the results
for(int i = 0; i < numElements; ++i)
{
// Compare absolute values because CUDA does not preserve the sign of a zero quotient
if (abs(output_vector[i]) != abs(results[i]))
{
if (isfinite(output_vector[i]) && isfinite(results[i]))
{
std::cerr << "Result verification failed at element " << i << "!\n"
<< "Cuda: " << output_vector[i] << '\n'
<< "Serial: " << results[i] << std::endl;
return EXIT_FAILURE;
}
}
}

std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
std::cout << "Done\n";

return 0;
}
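
Not part of the committed file, but as a related sketch: the kernel above relies on the launch configuration covering every element with exactly one thread. A grid-stride form of the same division kernel, assuming the same float_type alias, tolerates any grid size:

__global__ void cuda_test_strided(const float_type* in1, const float_type* in2, float_type* out, int numElements)
{
    // Each thread starts at its global index and advances by the total number of
    // launched threads, so any launch configuration covers all numElements elements.
    for (int i = blockDim.x * blockIdx.x + threadIdx.x; i < numElements; i += blockDim.x * gridDim.x)
    {
        out[i] = in1[i] / in2[i];
    }
}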
