From 852bb9cb6fb707a6fcebff7e068dc6bbdda779cb Mon Sep 17 00:00:00 2001
From: David Corvoysier
Date: Wed, 18 Sep 2024 17:04:03 +0000
Subject: [PATCH] feat(qtensor): add MarlinQBitsTensor

Adding more tests revealed a bug in the Marlin int4 kernel when the
weights and inputs are large enough. Failing configurations are marked
as xfail.
---
 .../tensor/weights/marlin/int4/__init__.py    |   1 +
 .../tensor/weights/marlin/int4/qbits.py       | 168 ++++++++++++++++++
 .../test_marlin_int4_weight_qbits_tensor.py   | 149 ++++++++++++++++
 3 files changed, 318 insertions(+)
 create mode 100644 optimum/quanto/tensor/weights/marlin/int4/qbits.py
 create mode 100644 test/tensor/weights/optimized/test_marlin_int4_weight_qbits_tensor.py

diff --git a/optimum/quanto/tensor/weights/marlin/int4/__init__.py b/optimum/quanto/tensor/weights/marlin/int4/__init__.py
index 0fdfc89c..2d213c0a 100644
--- a/optimum/quanto/tensor/weights/marlin/int4/__init__.py
+++ b/optimum/quanto/tensor/weights/marlin/int4/__init__.py
@@ -1 +1,2 @@
 from .packed import *
+from .qbits import *
diff --git a/optimum/quanto/tensor/weights/marlin/int4/qbits.py b/optimum/quanto/tensor/weights/marlin/int4/qbits.py
new file mode 100644
index 00000000..4a3aaf1a
--- /dev/null
+++ b/optimum/quanto/tensor/weights/marlin/int4/qbits.py
@@ -0,0 +1,168 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import ast
+
+import torch
+from torch.autograd import Function
+
+from ....function import QuantizedLinearFunction
+from ....grouped import group, ungroup
+from ....qtype import qtypes
+from ...qbits import WeightQBitsTensor
+from ..permutations import marlin_permute
+from .packed import MarlinInt4PackedTensor
+
+
+__all__ = ["MarlinInt4WeightQBitsTensor"]
+
+
+class MarlinQBitsDequantizer(Function):
+    @staticmethod
+    def forward(ctx, t):
+        unpacked = t._data.unpack()
+        scale = t._scale
+        shift = t._shift
+        unpacked = group(unpacked, axis=0, group_size=t._group_size)
+        # Apply inverted permutations
+        scale = marlin_permute(scale, reverse=True)
+        shift = marlin_permute(shift, reverse=True)
+        n_scales = scale.numel()
+        scale = scale.t().reshape((n_scales, 1))
+        shift = shift.t().reshape((n_scales, 1))
+        # Shift is already scaled and negated
+        dqt = scale * unpacked + shift
+        return ungroup(dqt, axis=t.axis, orig_shape=t.shape)
+
+    @staticmethod
+    def backward(ctx, gO):
+        return gO
+
+
+class MarlinQBitsLinearFunction(QuantizedLinearFunction):
+    @staticmethod
+    def forward(ctx, input, other, bias):
+        ctx.save_for_backward(input, other)
+        if type(input) is not torch.Tensor:
+            input = input.dequantize()
+        out_features, in_features = other.shape
+        output = torch.ops.quanto.gemm_f16i4_marlin(
+            input,
+            other._data._data,
+            other._scale,
+            other._shift,
+            other._workspace,
+        )
+        if bias is not None:
+            output = output + bias
+        return output
+
+
+class MarlinInt4WeightQBitsTensor(WeightQBitsTensor):
+    @staticmethod
+    def __new__(cls, qtype, axis, group_size, size, stride, data, scale, shift, requires_grad=False):
+        assert data.device.type == "cuda"
+        assert data.device == scale.device
+        assert data.device == shift.device
+        return torch.Tensor._make_wrapper_subclass(
+            cls, size, strides=stride, dtype=scale.dtype, device=data.device, requires_grad=requires_grad
+        )
+
+    def __init__(self, qtype, axis, group_size, size, stride, data, scale, shift, requires_grad=False):
+        assert axis == 0
+        out_features, in_features = size
+        if not isinstance(data, MarlinInt4PackedTensor):
+            assert type(data) is torch.Tensor
+            # Format data, scale and shift for optimized CUDA gemm
+            ungrouped = ungroup(data, axis=0, orig_shape=size)
+            data = MarlinInt4PackedTensor.pack(ungrouped)
+            scale = scale.reshape(out_features, in_features // group_size).t().contiguous()
+            shift = shift.reshape(out_features, in_features // group_size).t()
+            if not shift.dtype.is_floating_point:
+                # Integer shift must be scaled
+                shift = scale * shift
+            # Shift must be negated
+            shift = -shift.contiguous()
+            # Finally, apply scale and shift permutations
+            scale = marlin_permute(scale)
+            shift = marlin_permute(shift)
+        super().__init__(qtype, axis, group_size, size, stride, data, scale, shift)
+        self._workspace = torch.zeros(out_features // 128 * 16, dtype=torch.int, device=data.device)
+
+    def dequantize(self):
+        return MarlinQBitsDequantizer.apply(self)
+
+    def weight_qbits_tensor(self):
+        """Convert back to a WeightQBitsTensor
+
+        This is required to make sure only standard packing is used when serializing.
+        """
+        data = group(self._data.unpack(), axis=self.axis, group_size=self._group_size)
+        scale = marlin_permute(self._scale, reverse=True)
+        shift = marlin_permute(self._shift, reverse=True)
+        n_scales = scale.numel()
+        scale = scale.t().reshape((n_scales, 1))
+        shift = -shift.t().reshape((n_scales, 1))
+        return WeightQBitsTensor(
+            self._qtype, self._axis, self._group_size, self.size(), self.stride(), data, scale, shift
+        )
+
+    def __tensor_flatten__(self):
+        inner_tensors = ["_data", "_scale", "_shift"]
+        # Since meta can be used for serialization, use only strings
+        meta = {
+            "qtype": self._qtype.name,
+            "axis": str(self._axis),
+            "group_size": str(self._group_size),
+            "size": str(list(self.size())),
+            "stride": str(list(self.stride())),
+        }
+        return inner_tensors, meta
+
+    @staticmethod
+    def __tensor_unflatten__(inner_tensors, meta, outer_size, outer_stride):
+        assert len(inner_tensors) == 3
+        assert len(meta) == 5
+        data, scale, shift = inner_tensors["_data"], inner_tensors["_scale"], inner_tensors["_shift"]
+        # Meta should only contain strings, AST compatible except qtype
+        qtype = qtypes[meta["qtype"]]
+        axis = ast.literal_eval(meta["axis"])
+        group_size = ast.literal_eval(meta["group_size"])
+        size = ast.literal_eval(meta["size"])
+        stride = ast.literal_eval(meta["stride"])
+        return MarlinInt4WeightQBitsTensor(qtype, axis, group_size, size, stride, data, scale, shift)
+
+    @classmethod
+    def __torch_function__(cls, func, types, args=(), kwargs=None):
+        """Dispatch torch functions applied on this subtensor
+
+        This method is called whenever a torch function (such as `torch.nn.functional.linear`)
+        is called with at least one parameter corresponding to this subtensor:
+
+        - if a quantized implementation exists for the selected function, it is called,
+        - otherwise, the original implementation is called, deactivating further functional dispatch.
+
+        During the execution of the standard torch function, a second level of dispatch will
+        happen, but this time directly on individual torch Tensor operations (mainly ATen).
+        """
+        kwargs = kwargs or {}
+        if func is torch.nn.functional.linear:
+
+            def qlinear(input, other, bias=None):
+                return MarlinQBitsLinearFunction.apply(input, other, bias)
+
+            return qlinear(*args, **kwargs)
+        # Defer to operations dispatcher
+        with torch._C.DisableTorchFunctionSubclass():
+            return func(*args, **kwargs)
diff --git a/test/tensor/weights/optimized/test_marlin_int4_weight_qbits_tensor.py b/test/tensor/weights/optimized/test_marlin_int4_weight_qbits_tensor.py
new file mode 100644
index 00000000..a44db5b2
--- /dev/null
+++ b/test/tensor/weights/optimized/test_marlin_int4_weight_qbits_tensor.py
@@ -0,0 +1,149 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import pytest
+import torch
+from helpers import device_eq, random_qweight
+from tensor.weights.weight_helpers import check_weight_qtensor_linear
+
+from optimum.quanto import qint4
+from optimum.quanto.library.extensions import is_extension_available
+from optimum.quanto.tensor.weights import WeightQBitsTensor
+from optimum.quanto.tensor.weights.marlin.int4 import MarlinInt4WeightQBitsTensor
+
+
+@pytest.mark.skipif(
+    not torch.cuda.is_available() or torch.cuda.get_device_capability()[0] < 8, reason="CUDA >= sm80 not available"
+)
+@pytest.mark.parametrize("in_features", [128, 256, 512, 1024])
+@pytest.mark.parametrize("out_features", [128, 256, 512, 1024])
+def test_marlin_int4_weight_qbits_tensor_from_qbits_tensor(in_features, out_features):
+    qtype = qint4
+    group_size = 128
+    dtype = torch.float16
+    shape = (out_features, in_features)
+    device = torch.device("cuda")
+    qbt = random_qweight(shape, qtype, dtype, group_size=group_size, device=device)
+    # Create a MarlinInt4WeightQBitsTensor from the WeightQBitsTensor members
+    marlinqbt = MarlinInt4WeightQBitsTensor(
+        qtype=qbt.qtype,
+        axis=qbt.axis,
+        group_size=qbt._group_size,
+        size=qbt.size(),
+        stride=qbt.stride(),
+        data=qbt._data.unpack(),
+        scale=qbt._scale,
+        shift=qbt._shift,
+    )
+    assert marlinqbt.dtype == dtype
+    assert marlinqbt.qtype == qtype
+    assert marlinqbt.shape == shape
+    assert device_eq(marlinqbt.device, device)
+    # Verify the dequantized tensors are identical
+    assert torch.equal(marlinqbt.dequantize(), qbt.dequantize())
+    # Now verify that we can reconstruct the WeightQBitsTensor
+    new_qbt = marlinqbt.weight_qbits_tensor()
+    assert type(new_qbt) is WeightQBitsTensor
+    assert new_qbt.dtype == dtype
+    assert new_qbt.qtype == qtype
+    assert new_qbt.shape == shape
+    assert torch.equal(new_qbt._data, qbt._data)
+    assert torch.equal(new_qbt._scale, qbt._scale)
+    assert torch.equal(new_qbt._shift, qbt._shift)
+
+
+@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
+def test_marlin_int4_weight_qbits_tensor_move(device):
+    qtype = qint4
+    group_size = 128
+    dtype = torch.float16
+    shape = (1024, 1024)
+    # Create a MarlinInt4WeightQBitsTensor from a QBitsTensor on CUDA
+    qbt = random_qweight(shape, qtype, dtype, group_size=group_size, device=torch.device("cuda"))
+    marlinqbt = MarlinInt4WeightQBitsTensor(
+        qtype=qbt.qtype,
+        axis=qbt.axis,
+        group_size=qbt._group_size,
+        size=qbt.size(),
+        stride=qbt.stride(),
+        data=qbt._data.unpack(),
+        scale=qbt._scale,
+        shift=qbt._shift,
+    )
+    # Move to device, dequantize and compare
+    moved_qbt = marlinqbt.to(device)
+    assert isinstance(moved_qbt, WeightQBitsTensor)
+    if device.type != "cuda":
+        assert type(moved_qbt) is not MarlinInt4WeightQBitsTensor
+    assert marlinqbt.dtype == moved_qbt.dtype
+    assert marlinqbt.qtype == moved_qbt.qtype
+    assert marlinqbt.shape == moved_qbt.shape
+    assert torch.equal(marlinqbt.dequantize().to(device), moved_qbt.dequantize())
+
+
+def _test_marlin_int4_weight_qbits_tensor_linear(
+    dtype, weight_qtype, group_size, batch_size, tokens, in_features, out_features, use_bias
+):
+    # Create a MarlinInt4WeightQBitsTensor from a QBitsTensor on CUDA
+    qbt = random_qweight(
+        (out_features, in_features), weight_qtype, dtype, group_size=group_size, device=torch.device("cuda")
+    )
+    marlin_qweight = MarlinInt4WeightQBitsTensor(
+        qtype=qbt.qtype,
+        axis=qbt.axis,
+        group_size=qbt._group_size,
+        size=qbt.size(),
+        stride=qbt.stride(),
+        data=qbt._data.unpack(),
+        scale=qbt._scale,
+        shift=qbt._shift,
+    )
+    check_weight_qtensor_linear(marlin_qweight, batch_size, tokens, use_bias)
+
+
+@pytest.mark.skipif(
+    not is_extension_available("quanto_cuda") or torch.cuda.get_device_capability()[0] < 8,
+    reason="CUDA >= sm80 not available",
+)
+@pytest.mark.parametrize("batch_size", [1, 2])
+@pytest.mark.parametrize("tokens", [16, 32])
+@pytest.mark.parametrize("in_features", [1024])
+@pytest.mark.parametrize("out_features", [1024, 2048, 4096])
+@pytest.mark.parametrize("use_bias", [True, False], ids=["bias", "no-bias"])
+def test_marlin_int4_weight_qbits_tensor_linear(batch_size, tokens, in_features, out_features, use_bias):
+    dtype = torch.float16
+    weight_qtype = qint4
+    group_size = 128
+    _test_marlin_int4_weight_qbits_tensor_linear(
+        dtype, weight_qtype, group_size, batch_size, tokens, in_features, out_features, use_bias
+    )
+
+
+@pytest.mark.xfail(reason="Bug in Marlin kernel", strict=False)
+@pytest.mark.skipif(
+    not is_extension_available("quanto_cuda") or torch.cuda.get_device_capability()[0] < 8,
+    reason="CUDA >= sm80 not available",
+)
+@pytest.mark.parametrize("batch_size", [1, 2])
+@pytest.mark.parametrize("tokens", [48, 64])
+# @pytest.mark.parametrize("in_features", [1024, 2048, 4096, 16384])
+@pytest.mark.parametrize("in_features", [4096, 16384])
+@pytest.mark.parametrize("out_features", [2048, 4096])
+def test_marlin_int4_weight_qbits_tensor_linear_failing(batch_size, tokens, in_features, out_features):
+    dtype = torch.float16
+    weight_qtype = qint4
+    group_size = 128
+    _test_marlin_int4_weight_qbits_tensor_linear(
+        dtype, weight_qtype, group_size, batch_size, tokens, in_features, out_features, use_bias=False
+    )
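
Reviewer note: the sketch below mirrors, as a standalone script, what check_weight_qtensor_linear exercises in the tests above. It builds a MarlinInt4WeightQBitsTensor from a randomly quantized weight, runs torch.nn.functional.linear (which the new __torch_function__ override routes to the Marlin int4 GEMM) and compares the result with a matmul on the dequantized weight. It is only a sketch: it assumes a CUDA device with compute capability >= 8.0, a built quanto_cuda extension, and that the random_qweight helper from the test suite is importable; the shapes and the printed error check are illustrative choices, not taken from this patch. The configurations marked as xfail above use tokens 48/64 with in_features 4096/16384 and out_features 2048/4096.

import torch

from optimum.quanto import qint4
from optimum.quanto.tensor.weights.marlin.int4 import MarlinInt4WeightQBitsTensor

# Assumption: run from the repository's test/ directory so the pytest helpers resolve.
from helpers import random_qweight

device = torch.device("cuda")
out_features, in_features, group_size = 1024, 1024, 128

# Quantize a random fp16 weight with standard packing, then repack it for the
# Marlin kernel, exactly as the tests in this patch do.
qbt = random_qweight((out_features, in_features), qint4, torch.float16, group_size=group_size, device=device)
marlin_qweight = MarlinInt4WeightQBitsTensor(
    qtype=qbt.qtype,
    axis=qbt.axis,
    group_size=qbt._group_size,
    size=qbt.size(),
    stride=qbt.stride(),
    data=qbt._data.unpack(),
    scale=qbt._scale,
    shift=qbt._shift,
)

# torch.nn.functional.linear is intercepted by __torch_function__ and dispatched
# to MarlinQBitsLinearFunction, i.e. to the fused int4 Marlin GEMM.
tokens = 16  # larger tokens/in_features/out_features reach the configurations marked xfail
inputs = torch.rand((tokens, in_features), dtype=torch.float16, device=device)
marlin_out = torch.nn.functional.linear(inputs, marlin_qweight)

# Reference path: plain matmul against the dequantized weight.
ref_out = torch.nn.functional.linear(inputs, marlin_qweight.dequantize())

# Illustrative check only; the test helper defines its own tolerances.
print("max abs error:", (marlin_out - ref_out).abs().max().item())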