diff --git a/csrc/kernels.hip b/csrc/kernels.hip
index 723504fa8..54b6afb9d 100644
--- a/csrc/kernels.hip
+++ b/csrc/kernels.hip
@@ -500,7 +500,7 @@ __global__ void kHistogramScatterAdd2D(float* histogram, int *index1, int *index
 template<typename T, int BLOCK_SIZE, int NUM_MAX>
 __global__ void kCompressMax(T * __restrict__ const A, T* out, unsigned char* out_idx, const int n)
 {
-  typedef hipcub::WarpReduce<T> WarpReduce;
+  typedef hipcub::WarpReduce<T, 32> WarpReduce;
   __shared__ typename WarpReduce::TempStorage temp_storage;
   typedef hipcub::BlockLoad<T, BLOCK_SIZE/8 , 8, hipcub::BLOCK_LOAD_WARP_TRANSPOSE> LoadT;
   __shared__ typename LoadT::TempStorage loadt;
@@ -3553,7 +3553,7 @@ template <typename T, int THREADS, int BITS> __global__ void kgemm_4bit_inferenc
   // load step-by-step in chunks of [32,warps]: 1x32 * [32,warps] -> [1,warps]
   // 4 warps -> 4 loads per iter
   // 1x32 * 32x4 -> 1x4 outputs per thread block
-  typedef hipcub::WarpReduce<float> WarpReduce;
+  typedef hipcub::WarpReduce<float, 32> WarpReduce;
   __shared__ typename WarpReduce::TempStorage temp_storage[THREADS/32];
 
   const int warp_idx = threadIdx.x / 32;
diff --git a/tests/test_functional.py b/tests/test_functional.py
index 4591bd85c..7d8df2e48 100644
--- a/tests/test_functional.py
+++ b/tests/test_functional.py
@@ -2543,7 +2543,6 @@ def test_managed():
 @pytest.mark.parametrize("storage_type", ['nf4', 'fp4'], ids=['nf4', 'fp4'])
 @pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16, torch.float32], ids=['fp16', 'bf16', 'fp32'])
 @pytest.mark.parametrize("double_quant", [False], ids=['DQ_True'])
-@pytest.mark.skipif(HIP_ENVIRONMENT, reason="this test is not supported on ROCm yet")
 def test_gemv_eye_4bit(storage_type, dtype, double_quant):
     dims = 10
     torch.random.manual_seed(np.random.randint(0, 412424242))