From 1c84a30fe6353c8691f4809f64ec77c2cfeeb246 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= <me@danieldk.eu>
Date: Mon, 30 Sep 2024 19:40:25 +0200
Subject: [PATCH] MoE Marlin: support `desc_act` for `groupsize != -1` (#2590)

This change uses the updated Marlin MoE kernel from vLLM to support
MoE models that combine activation sorting (`desc_act`) with
quantization groups (`groupsize != -1`).
---
 flake.lock                                             |  7 ++++---
 flake.nix                                              |  2 +-
 server/text_generation_server/layers/marlin/gptq.py    |  3 +--
 server/text_generation_server/layers/moe/__init__.py   |  3 ---
 .../text_generation_server/layers/moe/gptq_marlin.py   | 10 ----------
 5 files changed, 6 insertions(+), 19 deletions(-)

diff --git a/flake.lock b/flake.lock
index fd6f3a54..e6361fda 100644
--- a/flake.lock
+++ b/flake.lock
@@ -978,15 +978,16 @@
         "nixpkgs": "nixpkgs_6"
       },
       "locked": {
-        "lastModified": 1727687740,
-        "narHash": "sha256-ssoGLmRoyQ+8d5utr5fwLox+/eQ789iVtUj1xrukIC0=",
+        "lastModified": 1727710820,
+        "narHash": "sha256-BuSafCxoFQhkp7lnvNtpquxSK43rIbnouL2HypIUC+o=",
         "owner": "danieldk",
         "repo": "tgi-nix",
-        "rev": "5e884ba50c26a7c93337bc0876f69da961c10374",
+        "rev": "4f4dc4b85dd856fd7904e8e3e486a2ff153584a2",
         "type": "github"
       },
       "original": {
         "owner": "danieldk",
+        "ref": "moe-kernels-0.5.0",
         "repo": "tgi-nix",
         "type": "github"
       }
diff --git a/flake.nix b/flake.nix
index 42fb3c6a..be19e908 100644
--- a/flake.nix
+++ b/flake.nix
@@ -5,7 +5,7 @@
       inputs.nixpkgs.follows = "tgi-nix/nixpkgs";
     };
     nix-filter.url = "github:numtide/nix-filter";
-    tgi-nix.url = "github:danieldk/tgi-nix";
+    tgi-nix.url = "github:danieldk/tgi-nix/moe-kernels-0.5.0";
     nixpkgs.follows = "tgi-nix/nixpkgs";
     flake-utils.url = "github:numtide/flake-utils";
     rust-overlay = {
diff --git a/server/text_generation_server/layers/marlin/gptq.py b/server/text_generation_server/layers/marlin/gptq.py
index c7663b60..0a785d94 100644
--- a/server/text_generation_server/layers/marlin/gptq.py
+++ b/server/text_generation_server/layers/marlin/gptq.py
@@ -109,7 +109,6 @@ class GPTQMarlinWeightsLoader(WeightsLoader):
         prefix: str,
         block_sizes: Union[int, List[int]],
     ):
-
         try:
             qweight = weights.get_packed_sharded(
                 f"{prefix}.qweight", dim=1, block_sizes=block_sizes
@@ -352,7 +351,7 @@ def repack_gptq_for_marlin(
 
     scales = permute_scales(scales)
 
-    is_full_k = not (desc_act and sharded_infeatures)
+    is_full_k = not (desc_act and groupsize != -1 and sharded_infeatures)
 
     return GPTQMarlinWeight(
         qweight=repacked,
diff --git a/server/text_generation_server/layers/moe/__init__.py b/server/text_generation_server/layers/moe/__init__.py
index ca71ebab..2c46ca02 100644
--- a/server/text_generation_server/layers/moe/__init__.py
+++ b/server/text_generation_server/layers/moe/__init__.py
@@ -249,12 +249,9 @@ class SparseMoELayer(nn.Module):
             or (
                 isinstance(weights.loader, GPTQMarlinWeightsLoader)
                 and can_use_marlin_moe_gemm(
-                    desc_act=weights.loader.desc_act,
-                    groupsize=weights.loader.groupsize,
                     quant_method=weights.loader.quant_method,
                     quantize=weights.loader.quantize,
                     sym=weights.loader.sym,
-                    use_tp=weights.process_group.size() > 1,
                 )
             )
         )
diff --git a/server/text_generation_server/layers/moe/gptq_marlin.py b/server/text_generation_server/layers/moe/gptq_marlin.py
index 3fc06cb2..3217cdc2 100644
--- a/server/text_generation_server/layers/moe/gptq_marlin.py
+++ b/server/text_generation_server/layers/moe/gptq_marlin.py
@@ -26,12 +26,9 @@ except Exception:
 
 def can_use_marlin_moe_gemm(
     *,
-    desc_act: bool,
-    groupsize: int,
     quant_method: str,
     quantize: str,
     sym: bool,
-    use_tp: bool,
 ):
     return (
         SYSTEM == "cuda"
@@ -40,16 +37,9 @@ def can_use_marlin_moe_gemm(
         and quantize == "gptq"
         and quant_method == "gptq"
         and sym
-        and is_full_k(desc_act, groupsize, use_tp)
     )
 
 
-def is_full_k(desc_act: bool, groupsize: int, use_tp: bool):
-    if groupsize == -1:
-        return True
-    return not (desc_act and use_tp)
-
-
 @dataclass
 class GPTQMarlinMoEWeight:
     qweight: torch.Tensor