diff --git a/csrc/kernels.hip b/csrc/kernels.hip index 723504fa8..54b6afb9d 100644 --- a/csrc/kernels.hip +++ b/csrc/kernels.hip @@ -500,7 +500,7 @@ __global__ void kHistogramScatterAdd2D(float* histogram, int *index1, int *index template __global__ void kCompressMax(T * __restrict__ const A, T* out, unsigned char* out_idx, const int n) { - typedef hipcub::WarpReduce WarpReduce; + typedef hipcub::WarpReduce WarpReduce; __shared__ typename WarpReduce::TempStorage temp_storage; typedef hipcub::BlockLoad LoadT; __shared__ typename LoadT::TempStorage loadt; @@ -3553,7 +3553,7 @@ template __global__ void kgemm_4bit_inferenc // load step-by-step in chunks of [32,warps]: 1x32 * [32,warps] -> [1,warps] // 4 warps -> 4 loads per iter // 1x32 * 32x4 -> 1x4 outputs per thread block - typedef hipcub::WarpReduce WarpReduce; + typedef hipcub::WarpReduce WarpReduce; __shared__ typename WarpReduce::TempStorage temp_storage[THREADS/32]; const int warp_idx = threadIdx.x / 32; diff --git a/tests/test_functional.py b/tests/test_functional.py index 4591bd85c..7d8df2e48 100644 --- a/tests/test_functional.py +++ b/tests/test_functional.py @@ -2543,7 +2543,6 @@ def test_managed(): @pytest.mark.parametrize("storage_type", ['nf4', 'fp4'], ids=['nf4', 'fp4']) @pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16, torch.float32], ids=['fp16', 'bf16', 'fp32']) @pytest.mark.parametrize("double_quant", [False], ids=['DQ_True']) -@pytest.mark.skipif(HIP_ENVIRONMENT, reason="this test is not supported on ROCm yet") def test_gemv_eye_4bit(storage_type, dtype, double_quant): dims = 10 torch.random.manual_seed(np.random.randint(0, 412424242))