Add support for FP8 on compute capability >=8.0, <8.9 (#2213)
Use FP8 GPTQ-Marlin kernels to enable FP8 support on CUDA GPUs with compute capability >=8.0 and <8.9.

Co-authored-by: Florian Zimmermeister <flozi00.fz@gmail.com>
commit cb150eb295
parent 8511669cb2
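For context, a minimal usage sketch (not part of this commit) of how the new path is picked up at runtime. It assumes the marlin_kernels extension built from this change is installed and that the GPU has compute capability 8.0-8.8; the tensor shapes and device are illustrative assumptions, while get_fp8_linear and GPTQMarlinFP8Linear come from the diff below.

import torch

from text_generation_server.layers.fp8 import get_fp8_linear

# [out_features, in_features] half-precision weight; FP8 quantization and the
# Marlin repack happen inside the layer constructor.
weight = torch.randn(4096, 4096, dtype=torch.float16, device="cuda")

# On compute capability 8.0-8.8 this returns GPTQMarlinFP8Linear; on other
# hardware it falls back to the existing Fp8Linear.
linear = get_fp8_linear()(weight, None)  # no bias in this sketch

x = torch.randn(8, 4096, dtype=torch.float16, device="cuda")
y = linear(x)  # dispatches to marlin_kernels.fp8_marlin_gemm with num_bits=8
print(y.shape)  # torch.Size([8, 4096])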
@@ -59,3 +59,18 @@ def marlin_gemm(
     Matrix multiplication using Marlin kernels.
     """
     ...
+
+# fp8 marlin
+def fp8_marlin_gemm(
+    a: torch.Tensor,
+    b_q_weight: torch.Tensor,
+    b_scales: torch.Tensor,
+    workspace: torch.Tensor,
+    num_bits: int,
+    size_m: int,
+    size_n: int,
+    size_k: int,
+) -> torch.Tensor:
+    return torch.ops._C.fp8_marlin_gemm(
+        a, b_q_weight, b_scales, workspace, num_bits, size_m, size_n, size_k
+    )
@@ -9,4 +9,6 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
   m.def("gptq_marlin_repack", &gptq_marlin_repack,
         "Repack GPTQ parameters for Marlin");
   m.def("marlin_gemm", &marlin_gemm, "Marlin gemm");
+  // fp8_marlin Optimized Quantized GEMM for FP8 weight-only.
+  m.def("fp8_marlin_gemm", &fp8_marlin_gemm);
 }
@@ -27,4 +27,9 @@ torch::Tensor marlin_gemm(torch::Tensor &a, torch::Tensor &b_q_weight,
                           torch::Tensor &b_scales, torch::Tensor &workspace,
                           int64_t size_m, int64_t size_n, int64_t size_k);
 
+torch::Tensor fp8_marlin_gemm(torch::Tensor& a, torch::Tensor& b_q_weight,
+                              torch::Tensor& b_scales, torch::Tensor& workspace,
+                              int64_t num_bits, int64_t size_m, int64_t size_n,
+                              int64_t size_k);
+
 #endif
File diff suppressed because it is too large
@@ -9,6 +9,7 @@ setup(
     CUDAExtension(
         name="marlin_kernels",
         sources=[
+            "marlin_kernels/fp8_marlin.cu",
            "marlin_kernels/gptq_marlin.cu",
            "marlin_kernels/gptq_marlin_repack.cu",
            "marlin_kernels/marlin_cuda_kernel.cu",
@@ -1,4 +1,23 @@
+from enum import Enum, auto
+
 import torch
+from text_generation_server.utils.import_utils import SYSTEM
+
+
+def get_fp8_linear() -> torch.nn.Module:
+    """
+    Return an FP8 linear `Module` that is compatible with the current system.
+    """
+
+    if SYSTEM == "cuda":
+        major, minor = torch.cuda.get_device_capability()
+        if major == 8 and minor < 9:
+            from text_generation_server.layers.marlin import GPTQMarlinFP8Linear
+
+            return GPTQMarlinFP8Linear
+
+    # On other systems let Torch decide if the hardware supports FP8.
+    return Fp8Linear
 
 
 def fp8_quantize(weight, qdtype=torch.float8_e4m3fn):
@@ -106,9 +106,9 @@ def get_linear(weight, bias, quantize):
                 "Please install EETQ from https://github.com/NetEase-FuXi/EETQ"
             )
     elif quantize == "fp8":
-        from text_generation_server.layers.fp8 import Fp8Linear
+        from text_generation_server.layers.fp8 import get_fp8_linear
 
-        linear = Fp8Linear(weight, bias)
+        linear = get_fp8_linear()(weight, bias)
     elif quantize == "bitsandbytes":
         try:
             from text_generation_server.layers.bnb import (
@@ -1,11 +1,13 @@
 from dataclasses import dataclass
 from typing import List, Optional, Tuple, Union
 
-from text_generation_server.utils.weights import Weights, WeightsLoader
-
 import torch
 import torch.nn as nn
+from loguru import logger
+from text_generation_server.layers.fp8 import fp8_quantize
 from text_generation_server.utils.import_utils import SYSTEM
+from text_generation_server.utils.log import log_once
+from text_generation_server.utils.weights import Weights, WeightsLoader
 
 try:
     import marlin_kernels
@@ -455,6 +457,115 @@ class GPTQMarlin24Linear(nn.Module):
         return C
 
 
+class GPTQMarlinFP8Linear(nn.Module):
+    """
+    FP8 GPTQ-Marlin linear layer.
+    """
+
+    def __init__(
+        self,
+        weight: torch.Tensor,
+        bias: Optional[torch.Tensor],
+    ) -> None:
+        super().__init__()
+
+        _check_marlin_kernels()
+        assert marlin_kernels is not None
+
+        log_once(logger.info, "GPU does not support FP8, using Marlin FP8 kernel")
+
+        qweight, scale = fp8_quantize(weight)
+        scale = scale.to(torch.float16)
+        qweight, scales = repack_fp8_for_marlin(qweight, scale)
+
+        in_features = qweight.shape[0] * MARLIN_TILE_SIZE
+        out_features = scales.shape[1]
+        _check_valid_shape(in_features=in_features, out_features=out_features)
+
+        self.qweight = qweight
+        self.scales = scales
+        self.bias = bias if bias is not None else None
+
+        self.workspace = torch.zeros(
+            out_features // 64 * 16, dtype=torch.int, device=qweight.device
+        )
+
+    def forward(self, A: torch.Tensor) -> torch.Tensor:
+        assert marlin_kernels is not None
+
+        A_flat = A.view(-1, A.shape[-1])
+        C = marlin_kernels.fp8_marlin_gemm(
+            A_flat,
+            self.qweight,
+            self.scales,
+            self.workspace,
+            8,
+            A_flat.shape[0],
+            self.scales.shape[1],
+            A_flat.shape[1],
+        )
+        C = C.reshape(A.shape[:-1] + (self.scales.shape[1],))
+
+        if self.bias is not None:
+            C += self.bias
+
+        return C
+
+
+def pack_fp8_as_int32(fp8_tensor: torch.Tensor) -> torch.Tensor:
+    """
+    Repack FP8 weights to GPTQ format (packed int32 elements).
+    """
+    assert fp8_tensor.dtype == torch.float8_e4m3fn
+
+    if fp8_tensor.shape[0] % 4 != 0:
+        raise ValueError(
+            f"Leading tensor dimension is not divisible by 4: {fp8_tensor.shape[0]}"
+        )
+
+    # Reshape to prepare for packing
+    reshaped = fp8_tensor.reshape(-1, 4, *fp8_tensor.shape[1:])
+
+    # Convert fp8 to uint8 (byte) representation
+    byte_tensor = reshaped.view(torch.uint8)
+
+    # Pack 4 uint8 values into one int32
+    packed = torch.zeros(
+        fp8_tensor.shape[0] // 4,
+        fp8_tensor.shape[1],
+        dtype=torch.int32,
+        device=fp8_tensor.device,
+    )
+
+    for i in range(4):
+        packed.bitwise_or_(byte_tensor[:, i].to(torch.int32) << i * 8)
+
+    return packed
+
+
+def repack_fp8_for_marlin(weight: torch.Tensor, scale: torch.Tensor):
+    """
+    Repack an FP8 tensor for GPTQ-Marlin.
+    """
+
+    out_features, in_features = weight.shape
+
+    # Torch linear layer weights have shape [out_features, in_features], while
+    # GPTQ-quantized weights use [in_features/pack_factor, out_features],
+    # so transpose before packing.
+    qweight = pack_fp8_as_int32(weight.t())
+
+    perm = torch.empty(0, dtype=torch.int, device=qweight.device)
+    repacked = marlin_kernels.gptq_marlin_repack(
+        qweight, perm, in_features, out_features, 8
+    )
+
+    scales = scale.reshape(1, 1).repeat(1, out_features)
+    scales = permute_scales(scales)
+
+    return repacked, scales
+
+
 @dataclass
 class MarlinWeight:
     """
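A note on the packed layout used by pack_fp8_as_int32 above: each int32 holds the FP8 bytes of four consecutive rows of the transposed weight, with row 4*r + i stored in bits [8*i, 8*i + 8) of packed[r]. The sketch below shows the inverse transform for illustration only; it is not part of the commit, and unpack_int32_as_fp8 is a hypothetical helper name.

import torch

def unpack_int32_as_fp8(packed: torch.Tensor) -> torch.Tensor:
    """Recover an [N, M] float8_e4m3fn tensor from the [N // 4, M] int32
    tensor produced by pack_fp8_as_int32 (illustration only)."""
    rows = []
    for i in range(4):
        # Row 4*r + i was stored in bits [8*i, 8*i + 8) of packed[r].
        rows.append(((packed >> (i * 8)) & 0xFF).to(torch.uint8))
    stacked = torch.stack(rows, dim=1)  # [N // 4, 4, M]
    return stacked.reshape(-1, packed.shape[1]).view(torch.float8_e4m3fn)

# Round trip with the packing function from the diff (compare byte patterns,
# since elementwise equality on FP8 tensors is limited):
# t = torch.randn(8, 16).to(torch.float8_e4m3fn)
# assert torch.equal(
#     unpack_int32_as_fp8(pack_fp8_as_int32(t)).view(torch.uint8), t.view(torch.uint8)
# )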