Add decimal64 div CUDA test
mborland committed Aug 13, 2024
1 parent b48bbb2 commit 62e5473
Showing 2 changed files with 120 additions and 0 deletions.
1 change: 1 addition & 0 deletions test/cuda_jamfile
@@ -21,6 +21,7 @@ run test_decimal32_div.cu ;
run test_decimal64_add.cu ;
run test_decimal64_sub.cu ;
run test_decimal64_mul.cu ;
run test_decimal64_div.cu ;

# Fast Types

119 changes: 119 additions & 0 deletions test/test_decimal64_div.cu
@@ -0,0 +1,119 @@

// Copyright John Maddock 2016.
// Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)

#pragma nv_diag_suppress 186

#include <iostream>
#include <iomanip>
#include <vector>
#include <random>
#include <boost/decimal.hpp>
#include "cuda_managed_ptr.hpp"
#include "stopwatch.hpp"

// For the CUDA runtime routines (prefixed with "cuda_")
#include <cuda_runtime.h>

using float_type = boost::decimal::decimal64;

/**
* CUDA Kernel Device code
*
*/
__global__ void cuda_test(const float_type* in1, const float_type* in2, float_type *out, int numElements)
{
// Global index of this thread; each thread performs one division
int i = blockDim.x * blockIdx.x + threadIdx.x;

if (i < numElements)
{
out[i] = in1[i] / in2[i];
}
}

/**
* Host main routine
*/
int main(void)
{
using namespace boost::decimal;

// Error code to check return values for CUDA calls
cudaError_t err = cudaSuccess;

// Print the vector length to be used, and compute its size
int numElements = 50000;
std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;

// Allocate the managed input vector A
cuda_managed_ptr<float_type> input_vector1(numElements);

// Allocate the managed input vector B
cuda_managed_ptr<float_type> input_vector2(numElements);

// Allocate the managed output vector C
cuda_managed_ptr<float_type> output_vector(numElements);

// Initialize the input vectors
std::mt19937_64 rng(42);
std::uniform_int_distribution<int> dist(-1000, 1000);
for (int i = 0; i < numElements; ++i)
{
input_vector1[i] = static_cast<float_type>(dist(rng));
input_vector2[i] = static_cast<float_type>(dist(rng));
}

// Launch the vector division CUDA kernel
int threadsPerBlock = 1024;
int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; // round up so every element gets a thread
std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;

watch w;

cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector1.get(), input_vector2.get(), output_vector.get(), numElements);
cudaDeviceSynchronize();

std::cout << "CUDA kernal done in: " << w.elapsed() << "s" << std::endl;

err = cudaGetLastError();

if (err != cudaSuccess)
{
std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
return EXIT_FAILURE;
}

// Verify that the result vector is correct
std::vector<float_type> results;
results.reserve(numElements);
w.reset();
for(int i = 0; i < numElements; ++i)
{
results.push_back(input_vector1[i] / input_vector2[i]);
}
double t = w.elapsed();
// check the results
for(int i = 0; i < numElements; ++i)
{
// Compare absolute values because CUDA does not preserve the sign of a zero quotient
if (abs(output_vector[i]) != abs(results[i]))
{
if (isfinite(output_vector[i]) && isfinite(results[i]))
{
std::cerr << "Result verification failed at element " << i << "!\n"
<< "Cuda: " << output_vector[i] << '\n'
<< "Serial: " << results[i] << std::endl;
return EXIT_FAILURE;
}
}
}

std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
std::cout << "Done\n";

return 0;
}
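
Not part of the committed file, but as a related sketch: the kernel above relies on the launch configuration covering every element with exactly one thread. A grid-stride form of the same division kernel, assuming the same float_type alias, tolerates any grid size:

__global__ void cuda_test_strided(const float_type* in1, const float_type* in2, float_type* out, int numElements)
{
    // Each thread starts at its global index and advances by the total number of
    // launched threads, so any launch configuration covers all numElements elements.
    for (int i = blockDim.x * blockIdx.x + threadIdx.x; i < numElements; i += blockDim.x * gridDim.x)
    {
        out[i] = in1[i] / in2[i];
    }
}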
