hf_text-generation-inference/server/text_generation_server/layers/marlin.py

from dataclasses import dataclass
from typing import Optional

import torch
import torch.nn as nn

try:
    import marlin
except ImportError:
    marlin = None

try:
    major, _minor = torch.cuda.get_device_capability()
    has_sm_8_0 = major >= 8
except Exception:
    has_sm_8_0 = False

MARLIN_TILE_SIZE = 16


@dataclass
class MarlinWeight:
    """
    Marlin weights.

    Attributes:
        B (torch.Tensor): int4-quantized weights packed into int32.
        s (torch.Tensor): float16 scales.
    """

    B: torch.Tensor
    s: torch.Tensor


class MarlinLinear(nn.Module):
    def __init__(
        self, *, B: torch.Tensor, s: torch.Tensor, bias: Optional[torch.Tensor]
    ):
        super().__init__()

        if not has_sm_8_0:
            raise NotImplementedError(
                "Using quantized marlin models requires CUDA capability 8.0 or later"
            )

        if marlin is None:
            raise NotImplementedError(
                "You do not seem to have marlin installed, either install it (cd server &&  make install-marlin)"
            )

        assert B.dtype == torch.int32
        assert s.dtype == torch.float16

        in_features = B.shape[0] * MARLIN_TILE_SIZE
        out_features = s.shape[1]
        assert (
            in_features % 128 == 0
        ), f"Number of input features ({in_features}) not divisable by 128"
        assert (
            out_features % 256 == 0
        ), f"Number of output features ({out_features}) not divisable by 256"

        group_size = -1 if s.shape[0] == 1 else in_features // s.shape[0]
        assert group_size in {
            -1,
            128,
        }, f"Group size must be -1 or 128, was {group_size}"

        self.register_buffer("B", B)
        self.register_buffer("s", s)
        if bias is not None:
            self.register_buffer("bias", bias)
        else:
            self.bias = None

        self.workspace = torch.zeros(
            out_features // 128 * 16, dtype=torch.int, device=B.device
        )

    def forward(self, A: torch.Tensor) -> torch.Tensor:
        assert marlin is not None
        C = torch.empty(
            A.shape[:-1] + (self.s.shape[1],), dtype=A.dtype, device=A.device
        )
        marlin.mul(
            A.view((-1, A.shape[-1])),
            self.B,
            C.view((-1, C.shape[-1])),
            self.s,
            self.workspace,
        )

        if self.bias is not None:
            C += self.bias

        return C
Add support for Marlin-quantized models This change adds support for Marlin-quantized models. Marlin is an FP16xINT4 matmul kernel, which provides good speedups decoding batches of 16-32 tokens. It supports quantized models with symmetric quantization, groupsize -1 or 128, and 4-bit. Tested with: - Llama 2 - Llama 3 - Phi 3 2024-06-05 02:14:40 -06:00			`from dataclasses import dataclass`
			`from typing import Optional`

			`import torch`
			`import torch.nn as nn`

			`try:`
			`import marlin`
			`except ImportError:`
			`marlin = None`

			`try:`
			`major, _minor = torch.cuda.get_device_capability()`
			`has_sm_8_0 = major >= 8`
			`except Exception:`
			`has_sm_8_0 = False`

			`MARLIN_TILE_SIZE = 16`


			`@dataclass`
			`class MarlinWeight:`
			`"""`
			`Marlin weights.`

			`Attributes:`
			`B (torch.Tensor): int4-quantized weights packed into int32.`
			`s (torch.Tensor): float16 scales.`
			`"""`

			`B: torch.Tensor`
			`s: torch.Tensor`


			`class MarlinLinear(nn.Module):`
			`def __init__(`
			`self, *, B: torch.Tensor, s: torch.Tensor, bias: Optional[torch.Tensor]`
			`):`
			`super().__init__()`

			`if not has_sm_8_0:`
			`raise NotImplementedError(`
			`"Using quantized marlin models requires CUDA capability 8.0 or later"`
			`)`

			`if marlin is None:`
			`raise NotImplementedError(`
			`"You do not seem to have marlin installed, either install it (cd server && make install-marlin)"`
			`)`

			`assert B.dtype == torch.int32`
			`assert s.dtype == torch.float16`

			`in_features = B.shape[0] * MARLIN_TILE_SIZE`
			`out_features = s.shape[1]`
			`assert (`
			`in_features % 128 == 0`
			`), f"Number of input features ({in_features}) not divisable by 128"`
			`assert (`
			`out_features % 256 == 0`
			`), f"Number of output features ({out_features}) not divisable by 256"`

			`group_size = -1 if s.shape[0] == 1 else in_features // s.shape[0]`
			`assert group_size in {`
			`-1,`
			`128,`
			`}, f"Group size must be -1 or 128, was {group_size}"`

			`self.register_buffer("B", B)`
			`self.register_buffer("s", s)`
			`if bias is not None:`
			`self.register_buffer("bias", bias)`
			`else:`
			`self.bias = None`

			`self.workspace = torch.zeros(`
			`out_features // 128 * 16, dtype=torch.int, device=B.device`
			`)`

			`def forward(self, A: torch.Tensor) -> torch.Tensor:`
			`assert marlin is not None`
			`C = torch.empty(`
			`A.shape[:-1] + (self.s.shape[1],), dtype=A.dtype, device=A.device`
			`)`
			`marlin.mul(`
			`A.view((-1, A.shape[-1])),`
			`self.B,`
			`C.view((-1, C.shape[-1])),`
			`self.s,`
			`self.workspace,`
			`)`

			`if self.bias is not None:`
			`C += self.bias`

			`return C`