hf_text-generation-inference/server/text_generation_server/layers/marlin/gptq.py

from dataclasses import dataclass
from typing import List, Optional, Union

import numpy
import torch
import torch.nn as nn
from loguru import logger
from text_generation_server.layers.marlin.util import (
    _check_marlin_kernels,
    marlin_zero_points,
    permute_scales,
    unpack_cols,
)
from text_generation_server.utils.import_utils import SYSTEM
from text_generation_server.utils.log import log_once
from text_generation_server.utils.weights import Weight, Weights, WeightsLoader

try:
    import marlin_kernels
except ImportError:
    marlin_kernels = None

try:
    major, _minor = torch.cuda.get_device_capability()
    has_sm_8_0 = major >= 8
except Exception:
    has_sm_8_0 = False


GPTQ_MARLIN_BITS = [4, 8]
GPTQ_MARLIN_GROUP_SIZES = [-1, 32, 64, 128]
MARLIN_TILE_SIZE = 16


def can_use_gptq_marlin(
    *, bits: int, groupsize: int, quant_method: str, quantize: str, sym: bool
) -> bool:
    return (
        SYSTEM == "cuda"
        and marlin_kernels is not None
        and has_sm_8_0
        and quantize in {"awq", "gptq"}
        and quant_method in {"awq", "gptq"}
        and bits in GPTQ_MARLIN_BITS
        and groupsize in GPTQ_MARLIN_GROUP_SIZES
        # We only support asymmetric quantization for AWQ.
        and (sym or quant_method == "awq")
    )


class GPTQMarlinWeightsLoader(WeightsLoader):
    """
    Loader for using GPTQ- and AWQ-quantized weights with Marlin kernels.
    """

    def __init__(
        self,
        *,
        bits: int,
        desc_act: bool,
        groupsize: int,
        quant_method: str,
        quantize: str,
        sym: bool,
    ):
        self.bits = bits
        self.desc_act = desc_act
        self.groupsize = groupsize
        self.quant_method = quant_method
        self.quantize = quantize
        self.sym = sym

    def get_weights(self, weights: Weights, prefix: str):
        log_once(logger.info, "Using GPTQ-Marlin kernels")
        try:
            qweight = weights.get_tensor(f"{prefix}.qweight")
        except RuntimeError:
            raise RuntimeError(
                f"Cannot load `{self.quantize}` weight for GPTQ -> Marlin repacking, make sure the model is already quantized"
            )

        if not self.sym:
            qzeros = weights.get_tensor(f"{prefix}.qzeros")
        else:
            qzeros = None

        if self.quant_method == "awq":
            g_idx = None
        else:
            g_idx = weights.get_tensor(f"{prefix}.g_idx")
        scales = weights.get_tensor(f"{prefix}.scales")

        return repack_gptq_for_marlin(
            qweight=qweight,
            scales=scales,
            qzeros=qzeros,
            g_idx=g_idx,
            bits=self.bits,
            desc_act=self.desc_act,
            groupsize=self.groupsize,
            quant_method=self.quant_method,
            sym=self.sym,
            sharded_infeatures=False,
        )

    def get_weights_col_packed(
        self,
        weights: Weights,
        prefix: str,
        block_sizes: Union[int, List[int]],
    ):
        try:
            qweight = weights.get_packed_sharded(
                f"{prefix}.qweight", dim=1, block_sizes=block_sizes
            )
        except RuntimeError:
            raise RuntimeError(
                f"Cannot load `{self.quantize}` weight, make sure the model is already quantized."
            )
        scales = weights.get_packed_sharded(
            f"{prefix}.scales", dim=1, block_sizes=block_sizes
        )
        scales = scales.to(dtype=weights.dtype)

        if not self.sym:
            qzeros = weights.get_packed_sharded(
                f"{prefix}.qzeros", dim=1, block_sizes=block_sizes
            )
        else:
            qzeros = None

        if self.quant_method == "awq":
            g_idx = None
        else:
            g_idx = weights.get_tensor(f"{prefix}.g_idx")
        return repack_gptq_for_marlin(
            qweight=qweight,
            scales=scales,
            qzeros=qzeros,
            g_idx=g_idx,
            bits=self.bits,
            desc_act=self.desc_act,
            groupsize=self.groupsize,
            quant_method=self.quant_method,
            sym=self.sym,
            sharded_infeatures=False,
        )

    def get_multi_weights_col(self, weights: Weights, prefixes: List[str], dim: int):
        try:
            qweight = torch.cat(
                [weights.get_sharded(f"{p}.qweight", dim=1) for p in prefixes], dim=1
            )
        except RuntimeError:
            raise RuntimeError(
                f"Cannot load `{self.quantize}` weight, make sure the model is already quantized"
            )

        scales = torch.cat(
            [weights.get_sharded(f"{p}.scales", dim=1) for p in prefixes], dim=1
        )

        if not self.sym:
            qzeros = torch.cat(
                [weights.get_sharded(f"{p}.qzeros", dim=1) for p in prefixes], dim=1
            )
        else:
            qzeros = None

        if self.quant_method == "awq":
            g_idx = None
        else:
            w = [weights.get_tensor(f"{p}.g_idx") for p in prefixes]
            for w2 in w[1:]:
                torch.testing.assert_close(w2, w[0])
            g_idx = w[0]

        return repack_gptq_for_marlin(
            qweight=qweight,
            scales=scales,
            qzeros=qzeros,
            g_idx=g_idx,
            bits=self.bits,
            desc_act=self.desc_act,
            groupsize=self.groupsize,
            quant_method=self.quant_method,
            sym=self.sym,
            sharded_infeatures=False,
        )

    def get_weights_row(self, weights: Weights, prefix: str):
        log_once(logger.info, "Using GPTQ-Marlin kernels")
        try:
            qweight = weights.get_sharded(f"{prefix}.qweight", dim=0)
        except RuntimeError:
            raise RuntimeError(
                f"Cannot load `{self.quantize}` weight for GPTQ -> Marlin repacking, make sure the model is already quantized"
            )

        if not self.sym:
            if self.desc_act or self.groupsize == -1:
                qzeros = weights.get_tensor(f"{prefix}.qzeros")
            else:
                qzeros = weights.get_sharded(f"{prefix}.qzeros", dim=0)
        else:
            qzeros = None

        if self.quant_method == "awq":
            g_idx = None
        else:
            g_idx = weights.get_sharded(f"{prefix}.g_idx", dim=0)

        if self.desc_act or self.groupsize == -1:
            scales = weights.get_tensor(f"{prefix}.scales")
        else:
            scales = weights.get_sharded(f"{prefix}.scales", dim=0)

        sharded_in_features = weights.process_group.size() > 1

        return repack_gptq_for_marlin(
            qweight=qweight,
            scales=scales,
            qzeros=qzeros,
            g_idx=g_idx,
            bits=self.bits,
            desc_act=self.desc_act,
            groupsize=self.groupsize,
            quant_method=self.quant_method,
            sym=self.sym,
            sharded_infeatures=sharded_in_features,
        )

    def _get_gptq_params(self, weights: Weights):
        if weights.has_tensor("gptq_bits") and weights.has_tensor("gptq_groupsize"):
            self.bits = weights.get_tensor("gptq_bits").item()
            self.groupsize = weights.get_tensor("gptq_groupsize").item()
            self.desc_act = False
            # `server quantize` used asymmetric quantization unconditionally
            # before the `gptq_sym` setting tensor was added.
            self.sym = (
                weights.get_tensor("gptq_sym").item()
                if weights.has_tensor("gptq_sym")
                else False
            )
            self.quant_method = "gptq"


@dataclass
class GPTQMarlinWeight(Weight):
    """
    Repacked GPTQ Marlin weights.
    """

    qweight: torch.Tensor
    qzeros: torch.Tensor
    scales: torch.Tensor
    g_idx: torch.Tensor
    perm: torch.Tensor
    bits: int
    is_full_k: bool

    def __post_init__(self):
        assert self.qweight.dtype == torch.int32
        assert self.scales.dtype == torch.float16
        assert self.g_idx.dtype == torch.int32
        assert self.perm.dtype == torch.int32

    def get_linear(self, bias: torch.Tensor):
        return GPTQMarlinLinear(
            weight=self,
            bias=bias,
        )


def repack_gptq_for_marlin(
    *,
    qweight: torch.Tensor,
    qzeros: Optional[torch.Tensor],
    scales: torch.Tensor,
    g_idx: Optional[torch.Tensor],
    bits: int,
    desc_act: bool,
    groupsize: int,
    quant_method: str,
    sym: bool,
    sharded_infeatures: bool,
) -> GPTQMarlinWeight:
    """Convert GPTQ weights to a layout that's compatible with GPTQ-Marlin kernels."""
    _check_marlin_kernels()
    assert marlin_kernels is not None

    if bits not in GPTQ_MARLIN_BITS:
        supported_bits = ", ".join(str(b) for b in GPTQ_MARLIN_BITS)
        raise RuntimeError(
            f"Repacking {bits}-bit GPTQ weights as Marlin is not supported, must be one of: {supported_bits}"
        )

    if groupsize not in GPTQ_MARLIN_GROUP_SIZES:
        supported_sizes = ", ".join(str(b) for b in GPTQ_MARLIN_GROUP_SIZES)
        raise RuntimeError(
            f"Repacking GPTQ weights with group size {groupsize} as Marlin is not supported, must be one of: {supported_sizes}"
        )
    if not (sym or quant_method == "awq"):
        raise RuntimeError(
            "Repacking GPTQ weights with asymmetric quantization as Marlin is not supported."
        )

    log_once(logger.info, f"Converting {quant_method} model to Marlin packing format.")

    weights_per_int = 32 // bits
    in_features = qweight.shape[0]
    out_features = qweight.shape[1]

    # AWQ uses column packing, GPTQ uses row packing
    if quant_method == "awq":
        out_features *= weights_per_int
    else:
        in_features *= weights_per_int

    if in_features % groupsize != 0:
        raise ValueError(
            f"Number of input features ({in_features}) not divisible by group size ({groupsize})"
        )

    if g_idx is not None and desc_act and groupsize != -1:
        perm = torch.argsort(g_idx).to(torch.int)
        g_idx = g_idx[perm]
    else:
        perm = torch.empty(0, dtype=torch.int, device=qweight.device)
        g_idx = torch.empty(0, dtype=torch.int, device=qweight.device)

    if quant_method == "awq":
        repacked = marlin_kernels.awq_marlin_repack(
            qweight, in_features, out_features, bits
        )
        if qzeros is not None:
            qzeros = awq_to_marlin_zero_points(
                qzeros,
                in_features // groupsize,
                out_features,
                bits,
            )

    else:
        repacked = marlin_kernels.gptq_marlin_repack(
            qweight, perm, in_features, out_features, bits
        )

    if qzeros is None:
        qzeros = torch.empty(0, dtype=torch.int, device=qweight.device)

    scales = permute_scales(scales)

    is_full_k = not (desc_act and groupsize != -1 and sharded_infeatures)

    return GPTQMarlinWeight(
        qweight=repacked,
        qzeros=qzeros,
        scales=scales,
        g_idx=g_idx,
        perm=perm,
        bits=bits,
        is_full_k=is_full_k,
    )


class GPTQMarlinLinear(nn.Module):
    """
    Linear layer for GPTQ weights that were converted for the GPTQ-Marlin
    kernels.
    """

    def __init__(
        self,
        *,
        weight: GPTQMarlinWeight,
        bias: Optional[torch.Tensor],
    ):
        super().__init__()

        _check_marlin_kernels()
        assert marlin_kernels is not None

        in_features = weight.qweight.shape[0] * MARLIN_TILE_SIZE
        out_features = weight.scales.shape[1]
        _check_valid_shape(in_features=in_features, out_features=out_features)

        self.bits = weight.bits
        self.is_full_k = weight.is_full_k

        self.qweight = weight.qweight
        self.qzeros = weight.qzeros
        self.scales = weight.scales
        self.g_idx = weight.g_idx
        self.perm = weight.perm
        if bias is not None:
            self.bias = bias
        else:
            self.bias = None

        self.workspace = torch.zeros(
            out_features // 64 * 16, dtype=torch.int, device=weight.qweight.device
        )

    def forward(self, A: torch.Tensor) -> torch.Tensor:
        assert marlin_kernels is not None

        A_flat = A.view(-1, A.shape[-1])
        C = marlin_kernels.gptq_marlin_gemm(
            A_flat,
            self.qweight,
            self.scales,
            self.qzeros,
            self.g_idx,
            self.perm,
            self.workspace,
            self.bits,
            A_flat.shape[0],
            self.scales.shape[1],
            A_flat.shape[1],
            self.is_full_k,
            self.qzeros.numel() > 0,
            True,
        )
        C = C.reshape(A.shape[:-1] + (self.scales.shape[1],))

        if self.bias is not None:
            C += self.bias

        return C


def awq_to_marlin_zero_points(
    q_zp_packed: torch.Tensor, size_k: int, size_n: int, num_bits: int
) -> torch.Tensor:
    # AWQ zero-points are quantized and packed on the column dim.
    # In addition, the values are permuted based on dequantizer.
    # Here we undo both of these, and then apply marlin permutation
    # and pack it back.
    q_zp = unpack_cols(q_zp_packed, num_bits, size_k, size_n)

    # Undo interleaving (use argsort(..) to get inverse perm)
    if num_bits == 4:
        undo_interleave = numpy.argsort(numpy.array([0, 2, 4, 6, 1, 3, 5, 7]))
    elif num_bits == 8:
        undo_interleave = numpy.argsort(numpy.array([0, 2, 1, 3]))
    else:
        raise Exception("num_bits must be 4 or 8, got {}".format(num_bits))

    q_zp = q_zp.reshape((-1, len(undo_interleave)))[:, undo_interleave].ravel()
    q_zp = q_zp.reshape((-1, size_n)).contiguous()

    marlin_zp = marlin_zero_points(q_zp, size_k, size_n, num_bits)
    return marlin_zp


def _check_valid_shape(in_features: int, out_features: int):
    if (in_features % 128 != 0 or out_features % 64 != 0) and (
        in_features % 64 != 0 or out_features % 128 != 0
    ):
        raise ValueError(
            f"The GPTQ Marlin kernel does not have a valid thread configuration for weight matrix with shape ({out_features}, {in_features})."
            " The shape elements must be divisible by (128, 64) or (64, 128)."
        )
Split up `layers.marlin` into several files (#2292) The marlin.py file was getting large, split it up. 2024-07-24 08:33:26 -06:00			`from dataclasses import dataclass`
Handle GPTQ-Marlin loading in `GPTQMarlinWeightLoader` (#2300) The `GPTWeightLoader` was structured like this in pseudocode: if marlin: Set up tensors in a way that GPTQ-Marlin expects else: Set up tensors in a way that ExLlama/GPTQ/AWQ expect However, the GPT-Marlin implementation details should really be in the `marlin` module. So move the former part out to a separate `GPTQMarlinWeightsLoader`. 2024-07-31 05:08:41 -06:00			`from typing import List, Optional, Union`
Split up `layers.marlin` into several files (#2292) The marlin.py file was getting large, split it up. 2024-07-24 08:33:26 -06:00
			`import numpy`
			`import torch`
			`import torch.nn as nn`
			`from loguru import logger`
			`from text_generation_server.layers.marlin.util import (`
			`_check_marlin_kernels,`
			`marlin_zero_points,`
			`permute_scales,`
			`unpack_cols,`
			`)`
			`from text_generation_server.utils.import_utils import SYSTEM`
			`from text_generation_server.utils.log import log_once`
Handle GPTQ-Marlin loading in `GPTQMarlinWeightLoader` (#2300) The `GPTWeightLoader` was structured like this in pseudocode: if marlin: Set up tensors in a way that GPTQ-Marlin expects else: Set up tensors in a way that ExLlama/GPTQ/AWQ expect However, the GPT-Marlin implementation details should really be in the `marlin` module. So move the former part out to a separate `GPTQMarlinWeightsLoader`. 2024-07-31 05:08:41 -06:00			`from text_generation_server.utils.weights import Weight, Weights, WeightsLoader`
Split up `layers.marlin` into several files (#2292) The marlin.py file was getting large, split it up. 2024-07-24 08:33:26 -06:00
			`try:`
			`import marlin_kernels`
			`except ImportError:`
			`marlin_kernels = None`

			`try:`
			`major, _minor = torch.cuda.get_device_capability()`
			`has_sm_8_0 = major >= 8`
			`except Exception:`
			`has_sm_8_0 = False`


			`GPTQ_MARLIN_BITS = [4, 8]`
			`GPTQ_MARLIN_GROUP_SIZES = [-1, 32, 64, 128]`
			`MARLIN_TILE_SIZE = 16`


			`def can_use_gptq_marlin(`
			`*, bits: int, groupsize: int, quant_method: str, quantize: str, sym: bool`
			`) -> bool:`
			`return (`
			`SYSTEM == "cuda"`
			`and marlin_kernels is not None`
			`and has_sm_8_0`
			`and quantize in {"awq", "gptq"}`
			`and quant_method in {"awq", "gptq"}`
			`and bits in GPTQ_MARLIN_BITS`
			`and groupsize in GPTQ_MARLIN_GROUP_SIZES`
Add support for fused MoE Marlin for AWQ (#2616) * Add support for fused MoE Marlin for AWQ This uses the updated MoE Marlin kernels from vLLM. * Add integration test for AWQ MoE 2024-10-08 03:56:41 -06:00			`# We only support asymmetric quantization for AWQ.`
Split up `layers.marlin` into several files (#2292) The marlin.py file was getting large, split it up. 2024-07-24 08:33:26 -06:00			`and (sym or quant_method == "awq")`
			`)`


Handle GPTQ-Marlin loading in `GPTQMarlinWeightLoader` (#2300) The `GPTWeightLoader` was structured like this in pseudocode: if marlin: Set up tensors in a way that GPTQ-Marlin expects else: Set up tensors in a way that ExLlama/GPTQ/AWQ expect However, the GPT-Marlin implementation details should really be in the `marlin` module. So move the former part out to a separate `GPTQMarlinWeightsLoader`. 2024-07-31 05:08:41 -06:00			`class GPTQMarlinWeightsLoader(WeightsLoader):`
			`"""`
			`Loader for using GPTQ- and AWQ-quantized weights with Marlin kernels.`
			`"""`

			`def __init__(`
			`self,`
			`*,`
			`bits: int,`
			`desc_act: bool,`
			`groupsize: int,`
			`quant_method: str,`
			`quantize: str,`
			`sym: bool,`
			`):`
			`self.bits = bits`
			`self.desc_act = desc_act`
			`self.groupsize = groupsize`
			`self.quant_method = quant_method`
			`self.quantize = quantize`
			`self.sym = sym`

			`def get_weights(self, weights: Weights, prefix: str):`
			`log_once(logger.info, "Using GPTQ-Marlin kernels")`
			`try:`
			`qweight = weights.get_tensor(f"{prefix}.qweight")`
			`except RuntimeError:`
			`raise RuntimeError(`
			f"Cannot load `{self.quantize}` weight for GPTQ -> Marlin repacking, make sure the model is already quantized"
			`)`

			`if not self.sym:`
			`qzeros = weights.get_tensor(f"{prefix}.qzeros")`
			`else:`
			`qzeros = None`

			`if self.quant_method == "awq":`
			`g_idx = None`
			`else:`
			`g_idx = weights.get_tensor(f"{prefix}.g_idx")`
			`scales = weights.get_tensor(f"{prefix}.scales")`

			`return repack_gptq_for_marlin(`
			`qweight=qweight,`
			`scales=scales,`
			`qzeros=qzeros,`
			`g_idx=g_idx,`
			`bits=self.bits,`
			`desc_act=self.desc_act,`
			`groupsize=self.groupsize,`
			`quant_method=self.quant_method,`
			`sym=self.sym,`
			`sharded_infeatures=False,`
			`)`

			`def get_weights_col_packed(`
			`self,`
			`weights: Weights,`
			`prefix: str,`
			`block_sizes: Union[int, List[int]],`
			`):`
			`try:`
			`qweight = weights.get_packed_sharded(`
			`f"{prefix}.qweight", dim=1, block_sizes=block_sizes`
			`)`
			`except RuntimeError:`
			`raise RuntimeError(`
			f"Cannot load `{self.quantize}` weight, make sure the model is already quantized."
			`)`
			`scales = weights.get_packed_sharded(`
			`f"{prefix}.scales", dim=1, block_sizes=block_sizes`
			`)`
			`scales = scales.to(dtype=weights.dtype)`

			`if not self.sym:`
			`qzeros = weights.get_packed_sharded(`
			`f"{prefix}.qzeros", dim=1, block_sizes=block_sizes`
			`)`
			`else:`
			`qzeros = None`

			`if self.quant_method == "awq":`
			`g_idx = None`
			`else:`
			`g_idx = weights.get_tensor(f"{prefix}.g_idx")`
			`return repack_gptq_for_marlin(`
			`qweight=qweight,`
			`scales=scales,`
			`qzeros=qzeros,`
			`g_idx=g_idx,`
			`bits=self.bits,`
			`desc_act=self.desc_act,`
			`groupsize=self.groupsize,`
			`quant_method=self.quant_method,`
			`sym=self.sym,`
			`sharded_infeatures=False,`
			`)`

			`def get_multi_weights_col(self, weights: Weights, prefixes: List[str], dim: int):`
			`try:`
			`qweight = torch.cat(`
			`[weights.get_sharded(f"{p}.qweight", dim=1) for p in prefixes], dim=1`
			`)`
			`except RuntimeError:`
			`raise RuntimeError(`
			f"Cannot load `{self.quantize}` weight, make sure the model is already quantized"
			`)`

			`scales = torch.cat(`
			`[weights.get_sharded(f"{p}.scales", dim=1) for p in prefixes], dim=1`
			`)`

			`if not self.sym:`
			`qzeros = torch.cat(`
			`[weights.get_sharded(f"{p}.qzeros", dim=1) for p in prefixes], dim=1`
			`)`
			`else:`
			`qzeros = None`

			`if self.quant_method == "awq":`
			`g_idx = None`
			`else:`
			`w = [weights.get_tensor(f"{p}.g_idx") for p in prefixes]`
			`for w2 in w[1:]:`
			`torch.testing.assert_close(w2, w[0])`
			`g_idx = w[0]`

			`return repack_gptq_for_marlin(`
			`qweight=qweight,`
			`scales=scales,`
			`qzeros=qzeros,`
			`g_idx=g_idx,`
			`bits=self.bits,`
			`desc_act=self.desc_act,`
			`groupsize=self.groupsize,`
			`quant_method=self.quant_method,`
			`sym=self.sym,`
			`sharded_infeatures=False,`
			`)`

			`def get_weights_row(self, weights: Weights, prefix: str):`
			`log_once(logger.info, "Using GPTQ-Marlin kernels")`
			`try:`
			`qweight = weights.get_sharded(f"{prefix}.qweight", dim=0)`
			`except RuntimeError:`
			`raise RuntimeError(`
			f"Cannot load `{self.quantize}` weight for GPTQ -> Marlin repacking, make sure the model is already quantized"
			`)`

			`if not self.sym:`
			`if self.desc_act or self.groupsize == -1:`
			`qzeros = weights.get_tensor(f"{prefix}.qzeros")`
			`else:`
			`qzeros = weights.get_sharded(f"{prefix}.qzeros", dim=0)`
			`else:`
			`qzeros = None`

			`if self.quant_method == "awq":`
			`g_idx = None`
			`else:`
			`g_idx = weights.get_sharded(f"{prefix}.g_idx", dim=0)`

			`if self.desc_act or self.groupsize == -1:`
			`scales = weights.get_tensor(f"{prefix}.scales")`
			`else:`
			`scales = weights.get_sharded(f"{prefix}.scales", dim=0)`

			`sharded_in_features = weights.process_group.size() > 1`

			`return repack_gptq_for_marlin(`
			`qweight=qweight,`
			`scales=scales,`
			`qzeros=qzeros,`
			`g_idx=g_idx,`
			`bits=self.bits,`
			`desc_act=self.desc_act,`
			`groupsize=self.groupsize,`
			`quant_method=self.quant_method,`
			`sym=self.sym,`
			`sharded_infeatures=sharded_in_features,`
			`)`

			`def _get_gptq_params(self, weights: Weights):`
Fp8 e4m3_fnuz support for rocm (#2588) * (feat) fp8 fnuz support for rocm * (review comments) Fix compression_config load, type hints * (bug) update all has_tensor * (review_comments) fix typo and added comments * (nit) improved comment 2024-10-16 01:54:50 -06:00			`if weights.has_tensor("gptq_bits") and weights.has_tensor("gptq_groupsize"):`
Handle GPTQ-Marlin loading in `GPTQMarlinWeightLoader` (#2300) The `GPTWeightLoader` was structured like this in pseudocode: if marlin: Set up tensors in a way that GPTQ-Marlin expects else: Set up tensors in a way that ExLlama/GPTQ/AWQ expect However, the GPT-Marlin implementation details should really be in the `marlin` module. So move the former part out to a separate `GPTQMarlinWeightsLoader`. 2024-07-31 05:08:41 -06:00			`self.bits = weights.get_tensor("gptq_bits").item()`
			`self.groupsize = weights.get_tensor("gptq_groupsize").item()`
			`self.desc_act = False`
			# `server quantize` used asymmetric quantization unconditionally
			# before the `gptq_sym` setting tensor was added.
			`self.sym = (`
			`weights.get_tensor("gptq_sym").item()`
Fp8 e4m3_fnuz support for rocm (#2588) * (feat) fp8 fnuz support for rocm * (review comments) Fix compression_config load, type hints * (bug) update all has_tensor * (review_comments) fix typo and added comments * (nit) improved comment 2024-10-16 01:54:50 -06:00			`if weights.has_tensor("gptq_sym")`
Handle GPTQ-Marlin loading in `GPTQMarlinWeightLoader` (#2300) The `GPTWeightLoader` was structured like this in pseudocode: if marlin: Set up tensors in a way that GPTQ-Marlin expects else: Set up tensors in a way that ExLlama/GPTQ/AWQ expect However, the GPT-Marlin implementation details should really be in the `marlin` module. So move the former part out to a separate `GPTQMarlinWeightsLoader`. 2024-07-31 05:08:41 -06:00			`else False`
			`)`
			`self.quant_method = "gptq"`


Split up `layers.marlin` into several files (#2292) The marlin.py file was getting large, split it up. 2024-07-24 08:33:26 -06:00			`@dataclass`
			`class GPTQMarlinWeight(Weight):`
			`"""`
			`Repacked GPTQ Marlin weights.`
			`"""`

			`qweight: torch.Tensor`
			`qzeros: torch.Tensor`
			`scales: torch.Tensor`
			`g_idx: torch.Tensor`
			`perm: torch.Tensor`
			`bits: int`
			`is_full_k: bool`

			`def __post_init__(self):`
			`assert self.qweight.dtype == torch.int32`
			`assert self.scales.dtype == torch.float16`
			`assert self.g_idx.dtype == torch.int32`
			`assert self.perm.dtype == torch.int32`

			`def get_linear(self, bias: torch.Tensor):`
			`return GPTQMarlinLinear(`
			`weight=self,`
			`bias=bias,`
			`)`


			`def repack_gptq_for_marlin(`
			`*,`
			`qweight: torch.Tensor,`
			`qzeros: Optional[torch.Tensor],`
			`scales: torch.Tensor,`
			`g_idx: Optional[torch.Tensor],`
			`bits: int,`
			`desc_act: bool,`
			`groupsize: int,`
			`quant_method: str,`
			`sym: bool,`
			`sharded_infeatures: bool,`
			`) -> GPTQMarlinWeight:`
			`"""Convert GPTQ weights to a layout that's compatible with GPTQ-Marlin kernels."""`
			`_check_marlin_kernels()`
			`assert marlin_kernels is not None`

			`if bits not in GPTQ_MARLIN_BITS:`
			`supported_bits = ", ".join(str(b) for b in GPTQ_MARLIN_BITS)`
			`raise RuntimeError(`
			`f"Repacking {bits}-bit GPTQ weights as Marlin is not supported, must be one of: {supported_bits}"`
			`)`

			`if groupsize not in GPTQ_MARLIN_GROUP_SIZES:`
			`supported_sizes = ", ".join(str(b) for b in GPTQ_MARLIN_GROUP_SIZES)`
			`raise RuntimeError(`
			`f"Repacking GPTQ weights with group size {groupsize} as Marlin is not supported, must be one of: {supported_sizes}"`
			`)`
			`if not (sym or quant_method == "awq"):`
			`raise RuntimeError(`
			`"Repacking GPTQ weights with asymmetric quantization as Marlin is not supported."`
			`)`

			`log_once(logger.info, f"Converting {quant_method} model to Marlin packing format.")`

			`weights_per_int = 32 // bits`
			`in_features = qweight.shape[0]`
			`out_features = qweight.shape[1]`

			`# AWQ uses column packing, GPTQ uses row packing`
			`if quant_method == "awq":`
			`out_features *= weights_per_int`
			`else:`
			`in_features *= weights_per_int`

			`if in_features % groupsize != 0:`
			`raise ValueError(`
			`f"Number of input features ({in_features}) not divisible by group size ({groupsize})"`
			`)`

			`if g_idx is not None and desc_act and groupsize != -1:`
			`perm = torch.argsort(g_idx).to(torch.int)`
			`g_idx = g_idx[perm]`
			`else:`
			`perm = torch.empty(0, dtype=torch.int, device=qweight.device)`
			`g_idx = torch.empty(0, dtype=torch.int, device=qweight.device)`

			`if quant_method == "awq":`
			`repacked = marlin_kernels.awq_marlin_repack(`
			`qweight, in_features, out_features, bits`
			`)`
			`if qzeros is not None:`
			`qzeros = awq_to_marlin_zero_points(`
			`qzeros,`
			`in_features // groupsize,`
			`out_features,`
			`bits,`
			`)`

			`else:`
			`repacked = marlin_kernels.gptq_marlin_repack(`
			`qweight, perm, in_features, out_features, bits`
			`)`

			`if qzeros is None:`
			`qzeros = torch.empty(0, dtype=torch.int, device=qweight.device)`

			`scales = permute_scales(scales)`

MoE Marlin: support `desc_act` for `groupsize != -1` (#2590) This change uses the updated Marlin MoE kernel from vLLM to support MoE with activation sorting and groups. 2024-09-30 11:40:25 -06:00			`is_full_k = not (desc_act and groupsize != -1 and sharded_infeatures)`
Split up `layers.marlin` into several files (#2292) The marlin.py file was getting large, split it up. 2024-07-24 08:33:26 -06:00
			`return GPTQMarlinWeight(`
			`qweight=repacked,`
			`qzeros=qzeros,`
			`scales=scales,`
			`g_idx=g_idx,`
			`perm=perm,`
			`bits=bits,`
			`is_full_k=is_full_k,`
			`)`


			`class GPTQMarlinLinear(nn.Module):`
			`"""`
			`Linear layer for GPTQ weights that were converted for the GPTQ-Marlin`
			`kernels.`
			`"""`

			`def __init__(`
			`self,`
			`*,`
			`weight: GPTQMarlinWeight,`
			`bias: Optional[torch.Tensor],`
			`):`
			`super().__init__()`

			`_check_marlin_kernels()`
			`assert marlin_kernels is not None`

			`in_features = weight.qweight.shape[0] * MARLIN_TILE_SIZE`
			`out_features = weight.scales.shape[1]`
			`_check_valid_shape(in_features=in_features, out_features=out_features)`

			`self.bits = weight.bits`
			`self.is_full_k = weight.is_full_k`

			`self.qweight = weight.qweight`
			`self.qzeros = weight.qzeros`
			`self.scales = weight.scales`
			`self.g_idx = weight.g_idx`
			`self.perm = weight.perm`
			`if bias is not None:`
			`self.bias = bias`
			`else:`
			`self.bias = None`

			`self.workspace = torch.zeros(`
			`out_features // 64 * 16, dtype=torch.int, device=weight.qweight.device`
			`)`

			`def forward(self, A: torch.Tensor) -> torch.Tensor:`
			`assert marlin_kernels is not None`

			`A_flat = A.view(-1, A.shape[-1])`
			`C = marlin_kernels.gptq_marlin_gemm(`
			`A_flat,`
			`self.qweight,`
			`self.scales,`
			`self.qzeros,`
			`self.g_idx,`
			`self.perm,`
			`self.workspace,`
			`self.bits,`
			`A_flat.shape[0],`
			`self.scales.shape[1],`
			`A_flat.shape[1],`
			`self.is_full_k,`
			`self.qzeros.numel() > 0,`
Install Marlin from standalone package (#2320) 2024-07-29 07:37:10 -06:00			`True,`
Split up `layers.marlin` into several files (#2292) The marlin.py file was getting large, split it up. 2024-07-24 08:33:26 -06:00			`)`
			`C = C.reshape(A.shape[:-1] + (self.scales.shape[1],))`

			`if self.bias is not None:`
			`C += self.bias`

			`return C`


			`def awq_to_marlin_zero_points(`
			`q_zp_packed: torch.Tensor, size_k: int, size_n: int, num_bits: int`
			`) -> torch.Tensor:`
			`# AWQ zero-points are quantized and packed on the column dim.`
			`# In addition, the values are permuted based on dequantizer.`
			`# Here we undo both of these, and then apply marlin permutation`
			`# and pack it back.`
			`q_zp = unpack_cols(q_zp_packed, num_bits, size_k, size_n)`

			`# Undo interleaving (use argsort(..) to get inverse perm)`
			`if num_bits == 4:`
			`undo_interleave = numpy.argsort(numpy.array([0, 2, 4, 6, 1, 3, 5, 7]))`
			`elif num_bits == 8:`
			`undo_interleave = numpy.argsort(numpy.array([0, 2, 1, 3]))`
			`else:`
			`raise Exception("num_bits must be 4 or 8, got {}".format(num_bits))`

			`q_zp = q_zp.reshape((-1, len(undo_interleave)))[:, undo_interleave].ravel()`
			`q_zp = q_zp.reshape((-1, size_n)).contiguous()`

			`marlin_zp = marlin_zero_points(q_zp, size_k, size_n, num_bits)`
			`return marlin_zp`


			`def _check_valid_shape(in_features: int, out_features: int):`
			`if (in_features % 128 != 0 or out_features % 64 != 0) and (`
			`in_features % 64 != 0 or out_features % 128 != 0`
			`):`
			`raise ValueError(`
			`f"The GPTQ Marlin kernel does not have a valid thread configuration for weight matrix with shape ({out_features}, {in_features})."`
			`" The shape elements must be divisible by (128, 64) or (64, 128)."`
			`)`