# hf_text-generation-inference/server/tests/utils/test_weights.py

import pytest
import torch
from text_generation_server.utils.weights import (
    DefaultWeightsLoader,
    UnquantizedWeight,
    Weights,
    WeightsLoader,
)
from text_generation_server.layers.gptq import GPTQWeight, GPTQWeightsLoader
from text_generation_server.layers.exl2 import Exl2Weight, Exl2WeightsLoader
from text_generation_server.layers.marlin import MarlinWeight, MarlinWeightsLoader
from types import SimpleNamespace
from typing import List, Optional, Dict, Union
from pathlib import Path


@pytest.fixture
def gptq_weights_loader():
    """Provide a `GPTQWeightsLoader` set up for the "gptq" quantization method."""
    loader_config = dict(
        bits=4,
        groupsize=-1,
        desc_act=False,
        quant_method="gptq",
        quantize="gptq",
        sym=True,
    )
    return GPTQWeightsLoader(**loader_config)


@pytest.fixture
def gptq_weights_loader_awq():
    """Provide a `GPTQWeightsLoader` set up for the "awq" quantization method."""
    loader_config = dict(
        bits=4,
        groupsize=-1,
        desc_act=False,
        quant_method="awq",
        quantize="awq",
        sym=True,
    )
    return GPTQWeightsLoader(**loader_config)


@pytest.fixture
def marlin_weights_loader():
    """Provide a 4-bit `MarlinWeightsLoader` with `is_marlin_24` switched off."""
    loader = MarlinWeightsLoader(bits=4, is_marlin_24=False)
    return loader


# In-memory stand-in for a set of weight files: maps a mock file name to the
# tensors ({tensor name: tensor}) that "file" contains. Most entries are named
# after the test that reads them.
dummy_file_system = {
    # --- unquantized weights -------------------------------------------------
    "test_weights": {
        "layer.0.weight": torch.tensor([[1, 2], [3, 4]], dtype=torch.float32),
    },
    "test_weights_2": {
        "layer.1337.weight": torch.tensor(
            [[1, 2, 3, 4], [5, 6, 7, 8]], dtype=torch.float32
        ),
    },
    "test_get_weights_col_packed": {
        "weight.weight": torch.tensor(
            [[1, 2], [3, 4], [5, 6], [7, 8]], dtype=torch.float32
        ),
    },
    "test_get_multi_weights_col": {
        # A dict literal keeps only the last of several identical keys, so this
        # file holds exactly one "weight.weight" tensor; a second, identical
        # entry for the same key would be dead code.
        "weight.weight": torch.tensor(
            [[1, 2], [3, 4], [5, 6], [7, 8]], dtype=torch.float32
        ),
    },
    "test_get_weights_row": {
        "weight.weight": torch.tensor(
            [[1, 2], [3, 4], [5, 6], [7, 8]], dtype=torch.float32
        ),
    },
    # --- GPTQ / AWQ ------------------------------------------------------------
    "test_get_weights_col_gptq": {
        # NOTE(review): qweight is float32 here but int32 in the other GPTQ
        # files below. The tests that read this file build their expected
        # qweight from float literals, so both sides must change together.
        "weight.qweight": torch.tensor(
            [[1, 2], [3, 4], [5, 6], [7, 8]], dtype=torch.float32
        ),
        "weight.g_idx": torch.tensor([0, 1, 0, 1], dtype=torch.int32),
        "weight.qzeros": torch.tensor([[0, 1], [1, 0]], dtype=torch.int32),
        "weight.scales": torch.tensor(
            [[100.0, 100.0], [100.0, 100.0]], dtype=torch.float16
        ),
        "gptq_bits": torch.tensor([8], dtype=torch.float32),
        "gptq_groupsize": torch.tensor([2], dtype=torch.float32),
    },
    # --- Marlin ----------------------------------------------------------------
    "test_get_weights_col_marlin": {
        "weight.B": torch.tensor([[1, 2], [3, 4]], dtype=torch.int32),
        "weight.s": torch.tensor([[0.5000], [0.2500]], dtype=torch.float16),
    },
    # --- GPTQ / AWQ (continued) --------------------------------------------------
    "test_get_weights_row_gptq": {
        "weight.qweight": torch.tensor(
            [[1, 2], [3, 4], [5, 6], [7, 8]], dtype=torch.int32
        ),
        "weight.g_idx": torch.tensor([0, 1, 0, 1], dtype=torch.int32),
        "weight.qzeros": torch.tensor([[0, 1], [1, 0]], dtype=torch.int32),
        "weight.scales": torch.tensor(
            [[100.0, 100.0], [100.0, 100.0]], dtype=torch.float16
        ),
        "gptq_bits": torch.tensor([8], dtype=torch.float32),
        "gptq_groupsize": torch.tensor([2], dtype=torch.float32),
    },
    "test_get_multi_weights_col_gptq": {
        "weight.qweight": torch.tensor(
            [[1, 2], [3, 4], [5, 6], [7, 8]], dtype=torch.int32
        ),
        "weight.g_idx": torch.tensor([0, 1, 0, 1], dtype=torch.int32),
        "weight.qzeros": torch.tensor([[0, 1], [1, 0]], dtype=torch.int32),
        "weight.scales": torch.tensor(
            [[100.0, 100.0], [100.0, 100.0]], dtype=torch.float16
        ),
        "gptq_bits": torch.tensor([8], dtype=torch.float32),
        "gptq_groupsize": torch.tensor([2], dtype=torch.float32),
    },
    "test_get_weights_col_packed_gptq": {
        "weight.qweight": torch.tensor(
            [[1, 2], [3, 4], [5, 6], [7, 8]], dtype=torch.int32
        ),
        "weight.g_idx": torch.tensor([0, 1, 0, 1], dtype=torch.int32),
        "weight.qzeros": torch.tensor([[0, 1], [1, 0]], dtype=torch.int32),
        "weight.scales": torch.tensor(
            [[100.0, 100.0], [100.0, 100.0]], dtype=torch.float16
        ),
        "gptq_bits": torch.tensor([8], dtype=torch.float32),
        "gptq_groupsize": torch.tensor([2], dtype=torch.float32),
    },
    # --- EXL2 ------------------------------------------------------------------
    "test_get_weights_col_packed_exl2": {
        "weight.q_weight": torch.tensor(
            [[1, 2], [3, 4], [5, 6], [7, 8]], dtype=torch.int32
        ),
        "weight.q_scale": torch.tensor([8], dtype=torch.int32),
        "weight.q_invperm": torch.tensor([1, 0, 3, 2], dtype=torch.int32),
        "weight.q_scale_max": torch.tensor([100], dtype=torch.float16),
        "weight.q_groups": torch.tensor([4], dtype=torch.int16),
    },
    "test_get_weights_row_exl2": {
        "weight.q_weight": torch.tensor(
            [[1, 2], [3, 4], [5, 6], [7, 8]], dtype=torch.int32
        ),
        "weight.q_scale": torch.tensor([8], dtype=torch.int32),
        "weight.q_invperm": torch.tensor([1, 0, 3, 2], dtype=torch.int32),
        "weight.q_scale_max": torch.tensor([100], dtype=torch.float16),
        "weight.q_groups": torch.tensor([4], dtype=torch.int16),
    },
    "test_get_multi_weights_col_exl2": {
        "weight.q_weight": torch.tensor(
            [[1, 2], [3, 4], [5, 6], [7, 8]], dtype=torch.int32
        ),
        "weight.q_scale": torch.tensor([8], dtype=torch.int32),
        "weight.q_invperm": torch.tensor([1, 0, 3, 2], dtype=torch.int32),
        "weight.q_scale_max": torch.tensor([100], dtype=torch.float16),
        "weight.q_groups": torch.tensor([4], dtype=torch.int16),
    },
    "test_get_weights_col_exl2": {
        "weight.q_weight": torch.tensor(
            [[1, 2], [3, 4], [5, 6], [7, 8]], dtype=torch.int32
        ),
        "weight.q_scale": torch.tensor([8], dtype=torch.int32),
        "weight.q_invperm": torch.tensor([1, 0, 3, 2], dtype=torch.int32),
        "weight.q_scale_max": torch.tensor([100], dtype=torch.float16),
        "weight.q_groups": torch.tensor([4], dtype=torch.int16),
    },
    # --- Marlin (continued) ------------------------------------------------------
    "test_get_weights_row_marlin": {
        "weight.B": torch.tensor([[1, 2], [3, 4]], dtype=torch.int32),
        "weight.s": torch.tensor([[0.5], [0.25]], dtype=torch.float16),
    },
    "test_get_multi_weights_col_marlin": {
        "weight.B": torch.tensor([[1, 2], [3, 4]], dtype=torch.int32),
        "weight.s": torch.tensor([[0.5], [0.25]], dtype=torch.float16),
    },
    "test_get_weights_col_packed_marlin": {
        "weight.B": torch.tensor([[1, 2], [3, 4]], dtype=torch.int32),
        "weight.s": torch.tensor([[0.5], [0.25]], dtype=torch.float16),
    },
}


class MockSlice:
    """Thin wrapper around a tensor that exposes its shape and indexing."""

    def __init__(self, tensor):
        # The wrapped tensor stays reachable as `.tensor`.
        self.tensor = tensor

    def get_shape(self):
        """Return the shape of the wrapped tensor."""
        wrapped = self.tensor
        return wrapped.shape

    def __getitem__(self, idx):
        """Index into the wrapped tensor with `idx` and return the result."""
        wrapped = self.tensor
        return wrapped[idx]


def mock_get_slice(tensor_name, filename):
    """Wrap the tensor stored under `filename`/`tensor_name` in a `MockSlice`.

    The lookup goes to the module-level `dummy_file_system`; an unknown file or
    tensor name raises `KeyError`.
    """
    return MockSlice(dummy_file_system[filename][tensor_name])


def mock_handle(filename, device, dtype):
    """Build a handle object whose `get_slice(tensor_name)` reads from `filename`.

    `device` and `dtype` are accepted but not used.
    """

    def get_slice(tensor_name):
        return mock_get_slice(tensor_name, filename)

    return SimpleNamespace(get_slice=get_slice)


class MockSafeOpen:
    """Context-manager test double listing the tensor names of one mock file.

    `dummy_fs` maps file names to `{tensor name: tensor}` dicts; `filename`
    selects the entry whose keys are reported.
    """

    def __init__(self, filename, framework, dummy_fs):
        self.filename, self.framework, self.dummy_fs = filename, framework, dummy_fs

    def keys(self):
        """Return the tensor names stored for `self.filename` as a list."""
        file_entries = self.dummy_fs[self.filename]
        return [name for name in file_entries.keys()]

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        # Nothing to release; returning None lets any exception propagate.
        return None


class MockWeights(Weights):
    """`Weights` subclass that serves tensors from an in-memory mock file system.

    The constructor does not call `Weights.__init__`; it sets the attributes
    itself and builds the tensor-name -> file-name routing table from
    `dummy_fs` through `MockSafeOpen`. `get_filename` is not defined here, so
    it comes from `Weights`.
    """

    def __init__(
        self,
        filenames: List[Union[Path, str]],
        device,
        dtype,
        process_group,
        dummy_fs,
        aliases: Optional[Dict[str, List[str]]] = None,
        prefix: Optional[str] = None,
        weights_loader: Optional[WeightsLoader] = None,
    ):
        self.dummy_fs = dummy_fs

        # Each tensor name may be owned by exactly one file.
        routing = {}
        for filename in filenames:
            with MockSafeOpen(filename, framework="pytorch", dummy_fs=dummy_fs) as f:
                for k in f.keys():
                    if k in routing:
                        raise RuntimeError(
                            f"Key {k} was found in multiple files: {filename} and {routing[k]}"
                        )
                    routing[k] = filename
        self.routing = routing

        self.aliases = {} if aliases is None else aliases
        self.device = device
        self.dtype = dtype
        self.process_group = process_group
        self.prefix = prefix

        if weights_loader is None:
            # Linear layers are not needed in these tests, so the default
            # loader just passes raw tensors through.
            weights_loader = DefaultWeightsLoader(lambda x: x)
        self.weights_loader = weights_loader

        # Cache of handles, one per file name, filled lazily.
        self._handles = {}

    def _get_handle(self, filename: Union[Path, str]):
        """Return the cached handle for `filename`, creating it on first use."""
        if filename not in self._handles:
            self._handles[filename] = mock_handle(filename, self.device, self.dtype)
        return self._handles[filename]

    def get_shape(self, tensor_name: str):
        """Return the shape of the tensor called `tensor_name`."""
        filename, _ = self.get_filename(tensor_name)
        tensor_slice = self._get_handle(filename).get_slice(tensor_name)
        return tensor_slice.get_shape()

    def get_tensor(self, tensor_name: str):
        """Return the full tensor called `tensor_name`."""
        filename, _ = self.get_filename(tensor_name)
        tensor_slice = self._get_handle(filename).get_slice(tensor_name)
        return tensor_slice.tensor


# Single-process stand-in for a process group: rank 0 in a group of size 1.
dummy_process_group = SimpleNamespace(
    rank=lambda: 0,
    size=lambda: 1,
)


def test_weights():
    """Tensors from two mock files are both reachable through one `MockWeights`."""
    mock_files = ["test_weights", "test_weights_2"]
    mock_weights = MockWeights(
        mock_files,
        device="cpu",
        dtype=torch.float32,
        process_group=dummy_process_group,
        dummy_fs=dummy_file_system,
    )

    first_shape = mock_weights.get_shape("layer.0.weight")
    assert first_shape == (2, 2)

    second_tensor = mock_weights.get_tensor("layer.1337.weight")
    assert second_tensor.shape == (2, 4)


def test_get_tensor():
    """`get_tensor` returns the values stored for each tensor name."""
    mock_weights = MockWeights(
        ["test_weights", "test_weights_2"],
        device="cpu",
        dtype=torch.float32,
        process_group=dummy_process_group,
        dummy_fs=dummy_file_system,
    )

    expected_first = torch.tensor([[1, 2], [3, 4]], dtype=torch.float32)
    assert torch.allclose(mock_weights.get_tensor("layer.0.weight"), expected_first)

    expected_second = torch.tensor(
        [[1, 2, 3, 4], [5, 6, 7, 8]],
        dtype=torch.float32,
    )
    assert torch.allclose(
        mock_weights.get_tensor("layer.1337.weight"), expected_second
    )


def test_get_weights_col_packed():
    """With `block_sizes=1`, the packed column weight equals the stored tensor."""
    mock_weights = MockWeights(
        ["test_get_weights_col_packed"],
        device="cpu",
        dtype=torch.float32,
        process_group=dummy_process_group,
        dummy_fs=dummy_file_system,
    )

    loaded = mock_weights.get_weights_col_packed(prefix="weight", block_sizes=1)

    expected = torch.tensor(
        [[1, 2], [3, 4], [5, 6], [7, 8]],
        dtype=torch.float32,
    )
    assert torch.allclose(loaded, expected)


def test_get_weights_col_packed_block_size():
    """With `block_sizes=2`, the packed column weight equals the stored tensor."""
    mock_weights = MockWeights(
        ["test_get_weights_col_packed"],
        device="cpu",
        dtype=torch.float32,
        process_group=dummy_process_group,
        dummy_fs=dummy_file_system,
    )

    loaded = mock_weights.get_weights_col_packed(prefix="weight", block_sizes=2)

    expected = torch.tensor(
        [[1, 2], [3, 4], [5, 6], [7, 8]],
        dtype=torch.float32,
    )
    assert torch.allclose(loaded, expected)


def test_get_weights_col_packed_block_size_arr():
    """With `block_sizes=[1, 1]`, the packed column weight equals the stored tensor."""
    mock_weights = MockWeights(
        ["test_get_weights_col_packed"],
        device="cpu",
        dtype=torch.float32,
        process_group=dummy_process_group,
        dummy_fs=dummy_file_system,
    )

    loaded = mock_weights.get_weights_col_packed(
        prefix="weight",
        block_sizes=[1, 1],
    )

    expected = torch.tensor(
        [[1, 2], [3, 4], [5, 6], [7, 8]],
        dtype=torch.float32,
    )
    assert torch.allclose(loaded, expected)


def test_get_multi_weights_col():
    """Loading the same prefix twice along dim 0 yields the tensor stacked on itself."""
    mock_weights = MockWeights(
        ["test_get_multi_weights_col"],
        device="cpu",
        dtype=torch.float32,
        process_group=dummy_process_group,
        dummy_fs=dummy_file_system,
    )

    loaded = mock_weights.get_multi_weights_col(
        prefixes=["weight", "weight"],
        dim=0,
    )

    # The expectation is the stored 4x2 block repeated once along dim 0.
    block = torch.tensor(
        [[1, 2], [3, 4], [5, 6], [7, 8]],
        dtype=torch.float32,
    )
    expected = torch.cat([block, block], dim=0)
    assert torch.allclose(loaded, expected)


def test_get_weights_row():
    """With the default loader, the row weight equals the stored tensor."""
    mock_weights = MockWeights(
        ["test_get_weights_row"],
        device="cpu",
        dtype=torch.float32,
        process_group=dummy_process_group,
        dummy_fs=dummy_file_system,
    )

    loaded = mock_weights.get_weights_row(prefix="weight")

    expected = torch.tensor(
        [[1.0, 2.0], [3.0, 4.0], [5.0, 6.0], [7.0, 8.0]],
        dtype=torch.float32,
    )
    assert torch.allclose(loaded, expected)


# test_get_weights_col


def test_get_weights_col_awq(gptq_weights_loader_awq):
    """Column weights from the AWQ-configured loader match the expected `GPTQWeight`.

    The expectation carries no `g_idx` and has `use_awq_kernel` enabled.
    """
    mock_weights = MockWeights(
        ["test_get_weights_col_gptq"],
        device="cpu",
        dtype=torch.float32,
        process_group=dummy_process_group,
        dummy_fs=dummy_file_system,
        weights_loader=gptq_weights_loader_awq,
    )

    loaded = mock_weights.get_weights_col(prefix="weight")

    expected = GPTQWeight(
        qweight=torch.tensor([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0], [7.0, 8.0]]),
        qzeros=torch.tensor([[0, 1], [1, 0]], dtype=torch.int32),
        scales=torch.tensor([[100.0, 100.0], [100.0, 100.0]], dtype=torch.float16),
        g_idx=None,
        bits=8.0,
        groupsize=2.0,
        use_awq_kernel=True,
        use_exllama=False,
    )

    # Tensor fields are compared numerically, everything else with `==`.
    for name in ("qweight", "qzeros", "scales"):
        actual, wanted = getattr(loaded, name), getattr(expected, name)
        assert torch.allclose(actual, wanted), f"{name} mismatch"
    for name in ("g_idx", "bits", "groupsize", "use_awq_kernel", "use_exllama"):
        assert getattr(loaded, name) == getattr(expected, name), f"{name} mismatch"


def test_get_weights_col_gtpq(gptq_weights_loader):
    """Column weights from the GPTQ-configured loader match the expected `GPTQWeight`.

    The expectation keeps `g_idx` and has `use_awq_kernel` disabled.
    """
    # NOTE(review): the function name spells "gtpq" rather than "gptq"; it is
    # kept as-is because it is the test's id.
    mock_weights = MockWeights(
        ["test_get_weights_col_gptq"],
        device="cpu",
        dtype=torch.float32,
        process_group=dummy_process_group,
        dummy_fs=dummy_file_system,
        weights_loader=gptq_weights_loader,
    )

    loaded = mock_weights.get_weights_col(prefix="weight")

    expected = GPTQWeight(
        qweight=torch.tensor([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0], [7.0, 8.0]]),
        qzeros=torch.tensor([[0, 1], [1, 0]], dtype=torch.int32),
        scales=torch.tensor([[100.0, 100.0], [100.0, 100.0]], dtype=torch.float16),
        g_idx=torch.tensor([0, 1, 0, 1], dtype=torch.int32),
        bits=8.0,
        groupsize=2.0,
        use_awq_kernel=False,
        use_exllama=False,
    )

    # Tensor fields are compared numerically, everything else with `==`.
    for name in ("qweight", "qzeros", "scales", "g_idx"):
        actual, wanted = getattr(loaded, name), getattr(expected, name)
        assert torch.allclose(actual, wanted), f"{name} mismatch"
    for name in ("bits", "groupsize", "use_awq_kernel", "use_exllama"):
        assert getattr(loaded, name) == getattr(expected, name), f"{name} mismatch"


def test_get_weights_col_exl2():
    """Column weights loaded with `Exl2WeightsLoader` match the expected `Exl2Weight`."""
    mock_weights = MockWeights(
        ["test_get_weights_col_exl2"],
        device="cpu",
        dtype=torch.float32,
        process_group=dummy_process_group,
        dummy_fs=dummy_file_system,
        weights_loader=Exl2WeightsLoader(),
    )

    loaded = mock_weights.get_weights_col(prefix="weight")

    # 0.3906 is roughly 100 / 256; presumably `Exl2Weight` divides q_scale_max
    # by 256 on construction, hence the pre-multiplication -- TODO confirm.
    scaled_scale_max = 0.3906 * 256
    expected = Exl2Weight(
        q_weight=torch.tensor([[1, 2], [3, 4], [5, 6], [7, 8]], dtype=torch.int32),
        q_scale=torch.tensor([8], dtype=torch.int32),
        q_invperm=torch.tensor([1, 0, 3, 2], dtype=torch.int16),
        q_scale_max=torch.tensor([scaled_scale_max], dtype=torch.float16),
        q_groups=torch.tensor([4], dtype=torch.int16),
    )

    for name in ("q_weight", "q_scale", "q_invperm", "q_scale_max", "q_groups"):
        actual, wanted = getattr(loaded, name), getattr(expected, name)
        assert torch.allclose(actual, wanted), f"{name} mismatch"


def test_get_weights_col_marlin(marlin_weights_loader):
    """Column weights loaded with the Marlin loader match the expected `MarlinWeight`."""
    mock_weights = MockWeights(
        ["test_get_weights_col_marlin"],
        device="cpu",
        dtype=torch.float16,
        process_group=dummy_process_group,
        dummy_fs=dummy_file_system,
        weights_loader=marlin_weights_loader,
    )

    loaded = mock_weights.get_weights_col(prefix="weight")

    expected = MarlinWeight(
        B=torch.tensor([[1, 2], [3, 4]], dtype=torch.int32),
        s=torch.tensor([[0.5000], [0.2500]], dtype=torch.float16),
    )

    for name in ("B", "s"):
        actual, wanted = getattr(loaded, name), getattr(expected, name)
        assert torch.allclose(actual, wanted), f"{name} mismatch"


# test_get_weights_col_packed


def test_get_weights_col_packed_awq(gptq_weights_loader_awq):
    """Packed column weights from the AWQ-configured loader match the expected `GPTQWeight`.

    The expectation carries no `g_idx` and has `use_awq_kernel` enabled.
    """
    mock_weights = MockWeights(
        ["test_get_weights_col_packed_gptq"],
        device="cpu",
        dtype=torch.float32,
        process_group=dummy_process_group,
        dummy_fs=dummy_file_system,
        weights_loader=gptq_weights_loader_awq,
    )

    loaded = mock_weights.get_weights_col_packed(prefix="weight", block_sizes=1)

    expected = GPTQWeight(
        qweight=torch.tensor([[1, 2], [3, 4], [5, 6], [7, 8]], dtype=torch.int32),
        qzeros=torch.tensor([[0, 1], [1, 0]], dtype=torch.int32),
        scales=torch.tensor([[100.0, 100.0], [100.0, 100.0]], dtype=torch.float16),
        g_idx=None,
        bits=8.0,
        groupsize=2.0,
        use_awq_kernel=True,
        use_exllama=False,
    )

    # Tensor fields are compared numerically, everything else with `==`.
    for name in ("qweight", "qzeros", "scales"):
        actual, wanted = getattr(loaded, name), getattr(expected, name)
        assert torch.allclose(actual, wanted), f"{name} mismatch"
    for name in ("g_idx", "bits", "groupsize", "use_awq_kernel", "use_exllama"):
        assert getattr(loaded, name) == getattr(expected, name), f"{name} mismatch"


@pytest.mark.skip(reason="Review expected functionality")
def test_get_weights_col_packed_exl2():
    """Packed column weights loaded with `Exl2WeightsLoader` (currently skipped)."""
    mock_weights = MockWeights(
        ["test_get_weights_col_packed_exl2"],
        device="cpu",
        dtype=torch.float32,
        process_group=dummy_process_group,
        dummy_fs=dummy_file_system,
        weights_loader=Exl2WeightsLoader(),
    )

    loaded = mock_weights.get_weights_col_packed(prefix="weight", block_sizes=1)

    # 0.3906 is roughly 100 / 256; presumably `Exl2Weight` divides q_scale_max
    # by 256 on construction, hence the pre-multiplication -- TODO confirm.
    scaled_scale_max = 0.3906 * 256
    expected = Exl2Weight(
        q_weight=torch.tensor([[1, 2], [3, 4], [5, 6], [7, 8]], dtype=torch.int32),
        q_scale=torch.tensor([8], dtype=torch.int32),
        q_invperm=torch.tensor([1], dtype=torch.int16),
        q_scale_max=torch.tensor([scaled_scale_max], dtype=torch.float16),
        q_groups=torch.tensor([4], dtype=torch.int16),
    )

    for name in ("q_weight", "q_scale", "q_invperm", "q_scale_max", "q_groups"):
        actual, wanted = getattr(loaded, name), getattr(expected, name)
        assert torch.allclose(actual, wanted), f"{name} mismatch"


def test_get_weights_col_packed_gptq(gptq_weights_loader):
    """Weights from the GPTQ-configured loader match the expected `GPTQWeight`.

    Reads the "test_get_weights_col_packed_gptq" mock file.
    """
    mock_weights = MockWeights(
        ["test_get_weights_col_packed_gptq"],
        device="cpu",
        dtype=torch.float32,
        process_group=dummy_process_group,
        dummy_fs=dummy_file_system,
        weights_loader=gptq_weights_loader,
    )

    # NOTE(review): despite the test name, this calls `get_multi_weights_col`
    # with a single prefix rather than `get_weights_col_packed` -- confirm
    # whether that is intended.
    loaded = mock_weights.get_multi_weights_col(prefixes=["weight"], dim=0)

    expected = GPTQWeight(
        qweight=torch.tensor([[1, 2], [3, 4], [5, 6], [7, 8]], dtype=torch.int32),
        qzeros=torch.tensor([[0, 1], [1, 0]], dtype=torch.int32),
        scales=torch.tensor([[100.0, 100.0], [100.0, 100.0]], dtype=torch.float16),
        g_idx=torch.tensor([0, 1, 0, 1], dtype=torch.int32),
        bits=8.0,
        groupsize=2.0,
        use_awq_kernel=False,
        use_exllama=False,
    )

    # Tensor fields are compared numerically, everything else with `==`.
    for name in ("qweight", "qzeros", "scales", "g_idx"):
        actual, wanted = getattr(loaded, name), getattr(expected, name)
        assert torch.allclose(actual, wanted), f"{name} mismatch"
    for name in ("bits", "groupsize", "use_awq_kernel", "use_exllama"):
        assert getattr(loaded, name) == getattr(expected, name), f"{name} mismatch"


def test_get_weights_col_packed_marlin(marlin_weights_loader):
    """Weights loaded with the Marlin loader match the expected `MarlinWeight`.

    Reads the "test_get_weights_col_packed_marlin" mock file.
    """
    weights = MockWeights(
        [
            "test_get_weights_col_packed_marlin",
        ],
        device="cpu",
        dtype=torch.float16,
        process_group=dummy_process_group,
        dummy_fs=dummy_file_system,
        weights_loader=marlin_weights_loader,
    )

    prefix = "weight"

    # NOTE(review): despite the test name, this calls `get_multi_weights_col`
    # with a single prefix rather than `get_weights_col_packed` -- confirm
    # whether that is intended.
    w = weights.get_multi_weights_col(
        prefixes=[prefix],
        dim=0,
    )

    expected_weight = MarlinWeight(
        B=torch.tensor([[1, 2], [3, 4]], dtype=torch.int32),
        s=torch.tensor([[0.5000], [0.2500]], dtype=torch.float16),
    )

    # The leftover debug print of `expected_weight` was removed; the assertion
    # messages below already identify the mismatching field.
    assert torch.allclose(w.B, expected_weight.B), "B mismatch"
    assert torch.allclose(w.s, expected_weight.s), "s mismatch"


# test_get_multi_weights_col


def test_get_multi_weights_col_awq(gptq_weights_loader_awq):
    """Multi-column weights from the AWQ-configured loader match the expected `GPTQWeight`.

    The expectation carries no `g_idx` and has `use_awq_kernel` enabled.
    """
    mock_weights = MockWeights(
        ["test_get_multi_weights_col_gptq"],
        device="cpu",
        dtype=torch.float32,
        process_group=dummy_process_group,
        dummy_fs=dummy_file_system,
        weights_loader=gptq_weights_loader_awq,
    )

    loaded = mock_weights.get_multi_weights_col(prefixes=["weight"], dim=0)

    expected = GPTQWeight(
        qweight=torch.tensor([[1, 2], [3, 4], [5, 6], [7, 8]], dtype=torch.int32),
        qzeros=torch.tensor([[0, 1], [1, 0]], dtype=torch.int32),
        scales=torch.tensor([[100.0, 100.0], [100.0, 100.0]], dtype=torch.float16),
        g_idx=None,
        bits=8.0,
        groupsize=2.0,
        use_awq_kernel=True,
        use_exllama=False,
    )

    # Tensor fields are compared numerically, everything else with `==`.
    for name in ("qweight", "qzeros", "scales"):
        actual, wanted = getattr(loaded, name), getattr(expected, name)
        assert torch.allclose(actual, wanted), f"{name} mismatch"
    for name in ("g_idx", "bits", "groupsize", "use_awq_kernel", "use_exllama"):
        assert getattr(loaded, name) == getattr(expected, name), f"{name} mismatch"


def test_get_multi_weights_col_exl2():
    """`get_multi_weights_col` with `Exl2WeightsLoader` must raise `ValueError`.

    The error message is expected to be
    "get_multi_weights_col is not supported for exl2".
    """
    weights = MockWeights(
        [
            "test_get_multi_weights_col_exl2",
        ],
        device="cpu",
        dtype=torch.float32,
        process_group=dummy_process_group,
        dummy_fs=dummy_file_system,
        weights_loader=Exl2WeightsLoader(),
    )

    prefix = "weight"

    # `pytest.raises` fails the test when nothing is raised; a bare
    # try/except would pass silently in that case.
    with pytest.raises(ValueError) as excinfo:
        weights.get_multi_weights_col(
            prefixes=[prefix],
            dim=0,
        )

    assert excinfo.value.args[0] == "get_multi_weights_col is not supported for exl2"


def test_get_multi_weights_col_gptq(gptq_weights_loader):
    """Multi-column weights from the GPTQ-configured loader match the expected `GPTQWeight`.

    The expectation keeps `g_idx` and has `use_awq_kernel` disabled.
    """
    mock_weights = MockWeights(
        ["test_get_multi_weights_col_gptq"],
        device="cpu",
        dtype=torch.float32,
        process_group=dummy_process_group,
        dummy_fs=dummy_file_system,
        weights_loader=gptq_weights_loader,
    )

    loaded = mock_weights.get_multi_weights_col(prefixes=["weight"], dim=0)

    expected = GPTQWeight(
        qweight=torch.tensor([[1, 2], [3, 4], [5, 6], [7, 8]], dtype=torch.int32),
        qzeros=torch.tensor([[0, 1], [1, 0]], dtype=torch.int32),
        scales=torch.tensor([[100.0, 100.0], [100.0, 100.0]], dtype=torch.float16),
        g_idx=torch.tensor([0, 1, 0, 1], dtype=torch.int32),
        bits=8.0,
        groupsize=2.0,
        use_awq_kernel=False,
        use_exllama=False,
    )

    # Tensor fields are compared numerically, everything else with `==`.
    for name in ("qweight", "qzeros", "scales", "g_idx"):
        actual, wanted = getattr(loaded, name), getattr(expected, name)
        assert torch.allclose(actual, wanted), f"{name} mismatch"
    for name in ("bits", "groupsize", "use_awq_kernel", "use_exllama"):
        assert getattr(loaded, name) == getattr(expected, name), f"{name} mismatch"


def test_get_multi_weights_col_marlin(marlin_weights_loader):
    """Multi-column weights loaded with the Marlin loader match the expected `MarlinWeight`."""
    mock_weights = MockWeights(
        ["test_get_multi_weights_col_marlin"],
        device="cpu",
        dtype=torch.float16,
        process_group=dummy_process_group,
        dummy_fs=dummy_file_system,
        weights_loader=marlin_weights_loader,
    )

    loaded = mock_weights.get_multi_weights_col(prefixes=["weight"], dim=0)

    expected = MarlinWeight(
        B=torch.tensor([[1, 2], [3, 4]], dtype=torch.int32),
        s=torch.tensor([[0.5000], [0.2500]], dtype=torch.float16),
    )

    for name in ("B", "s"):
        actual, wanted = getattr(loaded, name), getattr(expected, name)
        assert torch.allclose(actual, wanted), f"{name} mismatch"


# test_get_weights_row


def test_get_weights_row_awq(gptq_weights_loader_awq):
    """Row weights from the AWQ-configured loader match the expected `GPTQWeight`.

    The expectation carries no `g_idx` and has `use_awq_kernel` enabled.
    """
    mock_weights = MockWeights(
        ["test_get_weights_row_gptq"],
        device="cpu",
        dtype=torch.float32,
        process_group=dummy_process_group,
        dummy_fs=dummy_file_system,
        weights_loader=gptq_weights_loader_awq,
    )

    loaded = mock_weights.get_weights_row(prefix="weight")

    expected = GPTQWeight(
        qweight=torch.tensor([[1, 2], [3, 4], [5, 6], [7, 8]], dtype=torch.int32),
        qzeros=torch.tensor([[0, 1], [1, 0]], dtype=torch.int32),
        scales=torch.tensor([[100.0, 100.0], [100.0, 100.0]], dtype=torch.float16),
        g_idx=None,
        bits=8.0,
        groupsize=2.0,
        use_awq_kernel=True,
        use_exllama=False,
    )

    # Tensor fields are compared numerically, everything else with `==`.
    for name in ("qweight", "qzeros", "scales"):
        actual, wanted = getattr(loaded, name), getattr(expected, name)
        assert torch.allclose(actual, wanted), f"{name} mismatch"
    for name in ("g_idx", "bits", "groupsize", "use_awq_kernel", "use_exllama"):
        assert getattr(loaded, name) == getattr(expected, name), f"{name} mismatch"


def test_get_weights_row_exl2():
    """Row weights loaded with `Exl2WeightsLoader` match the expected `Exl2Weight`."""
    weights = MockWeights(
        [
            "test_get_weights_row_exl2",
        ],
        device="cpu",
        dtype=torch.float32,
        process_group=dummy_process_group,
        dummy_fs=dummy_file_system,
        weights_loader=Exl2WeightsLoader(),
    )

    prefix = "weight"

    # The leftover debug print of the loaded weight was removed; the assertion
    # messages below already identify the mismatching field.
    w = weights.get_weights_row(
        prefix=prefix,
    )

    # 0.3906 is roughly 100 / 256; presumably `Exl2Weight` divides q_scale_max
    # by 256 on construction, hence the pre-multiplication -- TODO confirm.
    scaled_scale_max = 0.3906 * 256
    expected_weight = Exl2Weight(
        q_weight=torch.tensor([[1, 2], [3, 4], [5, 6], [7, 8]], dtype=torch.int32),
        q_scale=torch.tensor([8], dtype=torch.int32),
        q_invperm=torch.tensor([1, 0, 3, 2], dtype=torch.int16),
        q_scale_max=torch.tensor([scaled_scale_max], dtype=torch.float16),
        q_groups=torch.tensor([4], dtype=torch.int16),
    )

    assert torch.allclose(w.q_weight, expected_weight.q_weight), "q_weight mismatch"
    assert torch.allclose(w.q_scale, expected_weight.q_scale), "q_scale mismatch"
    assert torch.allclose(w.q_invperm, expected_weight.q_invperm), "q_invperm mismatch"
    assert torch.allclose(
        w.q_scale_max, expected_weight.q_scale_max
    ), "q_scale_max mismatch"
    assert torch.allclose(w.q_groups, expected_weight.q_groups), "q_groups mismatch"


def test_get_weights_row_gptq(gptq_weights_loader):
    """Row weights from the GPTQ-configured loader match the expected `GPTQWeight`.

    The expectation keeps `g_idx` and has `use_awq_kernel` disabled.
    """
    mock_weights = MockWeights(
        ["test_get_weights_row_gptq"],
        device="cpu",
        dtype=torch.float32,
        process_group=dummy_process_group,
        dummy_fs=dummy_file_system,
        weights_loader=gptq_weights_loader,
    )

    loaded = mock_weights.get_weights_row(prefix="weight")

    expected = GPTQWeight(
        qweight=torch.tensor([[1, 2], [3, 4], [5, 6], [7, 8]], dtype=torch.int32),
        qzeros=torch.tensor([[0, 1], [1, 0]], dtype=torch.int32),
        scales=torch.tensor([[100.0, 100.0], [100.0, 100.0]], dtype=torch.float16),
        g_idx=torch.tensor([0, 1, 0, 1], dtype=torch.int32),
        bits=8.0,
        groupsize=2.0,
        use_awq_kernel=False,
        use_exllama=False,
    )

    # Tensor fields are compared numerically, everything else with `==`.
    for name in ("qweight", "qzeros", "scales", "g_idx"):
        actual, wanted = getattr(loaded, name), getattr(expected, name)
        assert torch.allclose(actual, wanted), f"{name} mismatch"
    for name in ("bits", "groupsize", "use_awq_kernel", "use_exllama"):
        assert getattr(loaded, name) == getattr(expected, name), f"{name} mismatch"


def test_get_weights_row_marlin(marlin_weights_loader):
    """Row weights loaded with the Marlin loader match the expected `MarlinWeight`."""
    mock_weights = MockWeights(
        ["test_get_weights_row_marlin"],
        device="cpu",
        dtype=torch.float16,
        process_group=dummy_process_group,
        dummy_fs=dummy_file_system,
        weights_loader=marlin_weights_loader,
    )

    loaded = mock_weights.get_weights_row(prefix="weight")

    expected = MarlinWeight(
        B=torch.tensor([[1, 2], [3, 4]], dtype=torch.int32),
        s=torch.tensor([[0.5000], [0.2500]], dtype=torch.float16),
    )

    for name in ("B", "s"):
        actual, wanted = getattr(loaded, name), getattr(expected, name)
        assert torch.allclose(actual, wanted), f"{name} mismatch"