Add initial support for compressed-tensors checkpoints (#2732)

compressed-tensors is a safetensors extension for sparse, quantized
tensors. The format is more flexible than the earlier AWQ/GPTQ/FP8
quantization formats, because:

- Different quantizer configurations can be used for different targets.
- The format can specify input/output quantizers in addition to weight
  quantizers.
- Layers can be selectively excluded from quantization.

(An example quantization configuration is sketched below.)

This change adds a dependency on the `compressed-tensors` package for
its configuration parsing and layer matching functionality.

The following types of quantization are supported in this PR:

- W8A16 and W4A16 INT using GPTQ-Marlin kernels.
- W8A8 and W8A16 FP using FP8-Marlin and cutlass kernels.

Support for other quantization types will be added in subsequent PRs.
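
For reference, a minimal sketch of the quantization configuration that the
new loader consumes from a checkpoint's `config.json`. All values below are
illustrative assumptions (loosely modeled on an FP8 W8A8 checkpoint such as
the one used in the integration tests), not the contents of any specific
model:

# Illustrative compressed-tensors configuration, as a Python dict.
# Field names follow the compressed-tensors QuantizationConfig schema;
# the concrete values are assumptions, not from a specific checkpoint.
quantization_config = {
    "quant_method": "compressed-tensors",
    "format": "float-quantized",         # FP8 weights
    "quantization_status": "compressed",
    "ignore": ["lm_head"],               # layers excluded from quantization
    "config_groups": {
        "group_0": {
            "targets": ["Linear"],       # groups can target different modules
            "weights": {
                "type": "float",
                "num_bits": 8,
                "strategy": "tensor",
                "dynamic": False,
            },
            "input_activations": {
                "type": "float",
                "num_bits": 8,
                "strategy": "token",
                "dynamic": True,
            },
        }
    },
}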
Daniël de Kok 2024-11-10 13:54:07 +01:00 committed by GitHub
parent 97f7a22f0b
commit a785000842
28 changed files with 2052 additions and 78 deletions

View File

@ -247,7 +247,7 @@ COPY server/Makefile server/Makefile
RUN cd server && \
make gen-server && \
pip install -r requirements_cuda.txt && \
pip install ".[bnb, accelerate, marlin, moe, quantize, peft, outlines]" --no-cache-dir && \
pip install ".[bnb, accelerate, compressed-tensors, marlin, moe, quantize, peft, outlines]" --no-cache-dir && \
pip install nvidia-nccl-cu12==2.22.3
ENV LD_PRELOAD=/opt/conda/lib/python3.11/site-packages/nvidia/nccl/lib/libnccl.so.2

View File

@ -296,7 +296,7 @@ COPY server/Makefile server/Makefile
RUN cd server && \
make gen-server && \
pip install -r requirements_rocm.txt && \
pip install ".[accelerate, peft, outlines]" --no-cache-dir
pip install ".[accelerate, compressed-tensors, peft, outlines]" --no-cache-dir
# Install benchmarker
COPY --from=builder /usr/src/target/release-opt/text-generation-benchmark /usr/local/bin/text-generation-benchmark

View File

@ -102,7 +102,7 @@ COPY server/Makefile server/Makefile
RUN cd server && \
make gen-server && \
pip install -r requirements_intel.txt && \
pip install ".[accelerate, peft, outlines]" --no-cache-dir
pip install ".[accelerate, compressed-tensors, peft, outlines]" --no-cache-dir
ENV CCL_ROOT=/opt/intel/oneapi/ccl/latest
ENV I_MPI_ROOT=/opt/intel/oneapi/mpi/latest

View File

@ -63,6 +63,7 @@ Options:
Possible values:
- awq: 4 bit quantization. Requires a specific AWQ quantized model: <https://hf.co/models?search=awq>. Should replace GPTQ models wherever possible because of the better latency
- compressed-tensors: Compressed tensors, which can be a mixture of different quantization methods
- eetq: 8 bit quantization, doesn't require specific model. Should be a drop-in replacement to bitsandbytes with much better performance. Kernels are from <https://github.com/NetEase-FuXi/EETQ.git>
- exl2: Variable bit quantization. Requires a specific EXL2 quantized model: <https://hf.co/models?search=exl2>. Requires exllama2 kernels and does not support tensor parallelism (num_shard > 1)
- gptq: 4 bit quantization. Requires a specific GPTQ quantized model: <https://hf.co/models?search=gptq>. text-generation-inference will use exllama (faster) kernels wherever possible, and use triton kernel (wider support) when it's not. AWQ has faster kernels
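
Note: as the model-loading changes further down show, passing --quantize
compressed-tensors is optional for such checkpoints; when the checkpoint's
config.json declares quant_method "compressed-tensors" (or carries the
legacy compression_config section), the method is auto-selected.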

View File

@ -978,15 +978,16 @@
"nixpkgs": "nixpkgs_6"
},
"locked": {
"lastModified": 1730724647,
"narHash": "sha256-SVv+50CGaCoU4zZwsg6ZAaOi/D5QJBL1P2SIB+3CEf4=",
"lastModified": 1730795478,
"narHash": "sha256-xpkXDKnkhXO4F6Ea3reHmqwXXRzQe2PsxdRQFPCViWs=",
"owner": "huggingface",
"repo": "text-generation-inference-nix",
"rev": "1512898a1e5ad9eff025205fa9c4d33a44506cf3",
"rev": "b7f6c07867d94d6e55f5352573a6b3dad1c88e56",
"type": "github"
},
"original": {
"owner": "huggingface",
"ref": "compressed-tensors-0.7.1",
"repo": "text-generation-inference-nix",
"type": "github"
}

View File

@ -5,7 +5,7 @@
inputs.nixpkgs.follows = "tgi-nix/nixpkgs";
};
nix-filter.url = "github:numtide/nix-filter";
tgi-nix.url = "github:huggingface/text-generation-inference-nix";
tgi-nix.url = "github:huggingface/text-generation-inference-nix/compressed-tensors-0.7.1";
nixpkgs.follows = "tgi-nix/nixpkgs";
flake-utils.url = "github:numtide/flake-utils";
rust-overlay = {

View File

@ -0,0 +1,104 @@
{
"details": {
"best_of_sequences": null,
"finish_reason": "length",
"generated_tokens": 10,
"prefill": [
{
"id": 128000,
"logprob": null,
"text": "<|begin_of_text|>"
},
{
"id": 3923,
"logprob": -7.609375,
"text": "What"
},
{
"id": 374,
"logprob": -0.92529297,
"text": " is"
},
{
"id": 5655,
"logprob": -10.0,
"text": " deep"
},
{
"id": 6975,
"logprob": -0.94628906,
"text": " learning"
},
{
"id": 30,
"logprob": -2.9042969,
"text": "?"
}
],
"seed": null,
"tokens": [
{
"id": 18682,
"logprob": -0.8769531,
"special": false,
"text": " Deep"
},
{
"id": 6975,
"logprob": -0.0076942444,
"special": false,
"text": " learning"
},
{
"id": 374,
"logprob": -0.25073242,
"special": false,
"text": " is"
},
{
"id": 264,
"logprob": -0.097595215,
"special": false,
"text": " a"
},
{
"id": 955,
"logprob": -0.921875,
"special": false,
"text": " type"
},
{
"id": 315,
"logprob": -0.00027918816,
"special": false,
"text": " of"
},
{
"id": 21075,
"logprob": -0.5527344,
"special": false,
"text": " artificial"
},
{
"id": 11478,
"logprob": -0.042541504,
"special": false,
"text": " intelligence"
},
{
"id": 320,
"logprob": -0.38891602,
"special": false,
"text": " ("
},
{
"id": 15836,
"logprob": -0.0011043549,
"special": false,
"text": "AI"
}
],
"top_tokens": null
},
"generated_text": " Deep learning is a type of artificial intelligence (AI"
}

View File

@ -0,0 +1,99 @@
{
"details": {
"best_of_sequences": null,
"finish_reason": "length",
"generated_tokens": 10,
"prefill": [
{
"id": 128000,
"logprob": null,
"text": "<|begin_of_text|>"
},
{
"id": 3923,
"logprob": -7.609375,
"text": "What"
},
{
"id": 374,
"logprob": -0.92529297,
"text": " is"
},
{
"id": 5655,
"logprob": -10.0,
"text": " deep"
},
{
"id": 6975,
"logprob": -0.94628906,
"text": " learning"
}
],
"seed": 0,
"tokens": [
{
"id": 5380,
"logprob": -0.23840332,
"special": false,
"text": "?\n"
},
{
"id": 34564,
"logprob": 0.0,
"special": false,
"text": "Deep"
},
{
"id": 6975,
"logprob": 0.0,
"special": false,
"text": " learning"
},
{
"id": 11,
"logprob": 0.0,
"special": false,
"text": ","
},
{
"id": 1101,
"logprob": -1.2011719,
"special": false,
"text": " also"
},
{
"id": 3967,
"logprob": 0.0,
"special": false,
"text": " known"
},
{
"id": 439,
"logprob": 0.0,
"special": false,
"text": " as"
},
{
"id": 30828,
"logprob": 0.0,
"special": false,
"text": " neural"
},
{
"id": 4009,
"logprob": -0.6777344,
"special": false,
"text": " network"
},
{
"id": 477,
"logprob": 0.0,
"special": false,
"text": " or"
}
],
"top_tokens": null
},
"generated_text": "What is deep learning?\nDeep learning, also known as neural network or"
}

View File

@ -0,0 +1,418 @@
[
{
"details": {
"best_of_sequences": null,
"finish_reason": "length",
"generated_tokens": 10,
"prefill": [
{
"id": 128000,
"logprob": null,
"text": "<|begin_of_text|>"
},
{
"id": 3923,
"logprob": -7.609375,
"text": "What"
},
{
"id": 374,
"logprob": -0.92529297,
"text": " is"
},
{
"id": 5655,
"logprob": -10.0,
"text": " deep"
},
{
"id": 6975,
"logprob": -0.94628906,
"text": " learning"
},
{
"id": 30,
"logprob": -2.9042969,
"text": "?"
}
],
"seed": null,
"tokens": [
{
"id": 18682,
"logprob": -0.8769531,
"special": false,
"text": " Deep"
},
{
"id": 6975,
"logprob": -0.0076942444,
"special": false,
"text": " learning"
},
{
"id": 374,
"logprob": -0.25146484,
"special": false,
"text": " is"
},
{
"id": 264,
"logprob": -0.097595215,
"special": false,
"text": " a"
},
{
"id": 955,
"logprob": -0.9248047,
"special": false,
"text": " type"
},
{
"id": 315,
"logprob": -0.00027513504,
"special": false,
"text": " of"
},
{
"id": 21075,
"logprob": -0.5527344,
"special": false,
"text": " artificial"
},
{
"id": 11478,
"logprob": -0.043151855,
"special": false,
"text": " intelligence"
},
{
"id": 320,
"logprob": -0.3840332,
"special": false,
"text": " ("
},
{
"id": 15836,
"logprob": -0.0011043549,
"special": false,
"text": "AI"
}
],
"top_tokens": null
},
"generated_text": " Deep learning is a type of artificial intelligence (AI"
},
{
"details": {
"best_of_sequences": null,
"finish_reason": "length",
"generated_tokens": 10,
"prefill": [
{
"id": 128000,
"logprob": null,
"text": "<|begin_of_text|>"
},
{
"id": 3923,
"logprob": -7.6054688,
"text": "What"
},
{
"id": 374,
"logprob": -0.92089844,
"text": " is"
},
{
"id": 5655,
"logprob": -10.0,
"text": " deep"
},
{
"id": 6975,
"logprob": -0.94433594,
"text": " learning"
},
{
"id": 30,
"logprob": -2.90625,
"text": "?"
}
],
"seed": null,
"tokens": [
{
"id": 18682,
"logprob": -0.875,
"special": false,
"text": " Deep"
},
{
"id": 6975,
"logprob": -0.007698059,
"special": false,
"text": " learning"
},
{
"id": 374,
"logprob": -0.25268555,
"special": false,
"text": " is"
},
{
"id": 264,
"logprob": -0.09753418,
"special": false,
"text": " a"
},
{
"id": 955,
"logprob": -0.92529297,
"special": false,
"text": " type"
},
{
"id": 315,
"logprob": -0.00027942657,
"special": false,
"text": " of"
},
{
"id": 21075,
"logprob": -0.5527344,
"special": false,
"text": " artificial"
},
{
"id": 11478,
"logprob": -0.042541504,
"special": false,
"text": " intelligence"
},
{
"id": 320,
"logprob": -0.3840332,
"special": false,
"text": " ("
},
{
"id": 15836,
"logprob": -0.0011053085,
"special": false,
"text": "AI"
}
],
"top_tokens": null
},
"generated_text": " Deep learning is a type of artificial intelligence (AI"
},
{
"details": {
"best_of_sequences": null,
"finish_reason": "length",
"generated_tokens": 10,
"prefill": [
{
"id": 128000,
"logprob": null,
"text": "<|begin_of_text|>"
},
{
"id": 3923,
"logprob": -7.6054688,
"text": "What"
},
{
"id": 374,
"logprob": -0.92089844,
"text": " is"
},
{
"id": 5655,
"logprob": -10.0,
"text": " deep"
},
{
"id": 6975,
"logprob": -0.94433594,
"text": " learning"
},
{
"id": 30,
"logprob": -2.90625,
"text": "?"
}
],
"seed": null,
"tokens": [
{
"id": 18682,
"logprob": -0.875,
"special": false,
"text": " Deep"
},
{
"id": 6975,
"logprob": -0.007698059,
"special": false,
"text": " learning"
},
{
"id": 374,
"logprob": -0.25268555,
"special": false,
"text": " is"
},
{
"id": 264,
"logprob": -0.09753418,
"special": false,
"text": " a"
},
{
"id": 955,
"logprob": -0.92529297,
"special": false,
"text": " type"
},
{
"id": 315,
"logprob": -0.00027942657,
"special": false,
"text": " of"
},
{
"id": 21075,
"logprob": -0.5527344,
"special": false,
"text": " artificial"
},
{
"id": 11478,
"logprob": -0.042541504,
"special": false,
"text": " intelligence"
},
{
"id": 320,
"logprob": -0.3840332,
"special": false,
"text": " ("
},
{
"id": 15836,
"logprob": -0.0011053085,
"special": false,
"text": "AI"
}
],
"top_tokens": null
},
"generated_text": " Deep learning is a type of artificial intelligence (AI"
},
{
"details": {
"best_of_sequences": null,
"finish_reason": "length",
"generated_tokens": 10,
"prefill": [
{
"id": 128000,
"logprob": null,
"text": "<|begin_of_text|>"
},
{
"id": 3923,
"logprob": -7.6054688,
"text": "What"
},
{
"id": 374,
"logprob": -0.92089844,
"text": " is"
},
{
"id": 5655,
"logprob": -10.0,
"text": " deep"
},
{
"id": 6975,
"logprob": -0.94433594,
"text": " learning"
},
{
"id": 30,
"logprob": -2.90625,
"text": "?"
}
],
"seed": null,
"tokens": [
{
"id": 18682,
"logprob": -0.875,
"special": false,
"text": " Deep"
},
{
"id": 6975,
"logprob": -0.007698059,
"special": false,
"text": " learning"
},
{
"id": 374,
"logprob": -0.25268555,
"special": false,
"text": " is"
},
{
"id": 264,
"logprob": -0.09753418,
"special": false,
"text": " a"
},
{
"id": 955,
"logprob": -0.92529297,
"special": false,
"text": " type"
},
{
"id": 315,
"logprob": -0.00027942657,
"special": false,
"text": " of"
},
{
"id": 21075,
"logprob": -0.5527344,
"special": false,
"text": " artificial"
},
{
"id": 11478,
"logprob": -0.042541504,
"special": false,
"text": " intelligence"
},
{
"id": 320,
"logprob": -0.3840332,
"special": false,
"text": " ("
},
{
"id": 15836,
"logprob": -0.0011053085,
"special": false,
"text": "AI"
}
],
"top_tokens": null
},
"generated_text": " Deep learning is a type of artificial intelligence (AI"
}
]

View File

@ -0,0 +1,104 @@
{
"details": {
"best_of_sequences": null,
"finish_reason": "length",
"generated_tokens": 10,
"prefill": [
{
"id": 2,
"logprob": null,
"text": "<bos>"
},
{
"id": 1841,
"logprob": -5.46875,
"text": "What"
},
{
"id": 603,
"logprob": -0.69140625,
"text": " is"
},
{
"id": 5271,
"logprob": -12.0,
"text": " deep"
},
{
"id": 6044,
"logprob": -0.32226562,
"text": " learning"
},
{
"id": 235336,
"logprob": -0.33203125,
"text": "?"
}
],
"seed": null,
"tokens": [
{
"id": 109,
"logprob": -0.24707031,
"special": false,
"text": "\n\n"
},
{
"id": 26843,
"logprob": -0.14550781,
"special": false,
"text": "Deep"
},
{
"id": 6044,
"logprob": -0.038330078,
"special": false,
"text": " learning"
},
{
"id": 603,
"logprob": -0.029907227,
"special": false,
"text": " is"
},
{
"id": 476,
"logprob": -0.020996094,
"special": false,
"text": " a"
},
{
"id": 38397,
"logprob": -0.828125,
"special": false,
"text": " subset"
},
{
"id": 576,
"logprob": -0.00049209595,
"special": false,
"text": " of"
},
{
"id": 6479,
"logprob": -0.057373047,
"special": false,
"text": " machine"
},
{
"id": 6044,
"logprob": -0.000207901,
"special": false,
"text": " learning"
},
{
"id": 674,
"logprob": -0.15429688,
"special": false,
"text": " that"
}
],
"top_tokens": null
},
"generated_text": "\n\nDeep learning is a subset of machine learning that"
}

View File

@ -0,0 +1,99 @@
{
"details": {
"best_of_sequences": null,
"finish_reason": "length",
"generated_tokens": 10,
"prefill": [
{
"id": 2,
"logprob": null,
"text": "<bos>"
},
{
"id": 1841,
"logprob": -5.46875,
"text": "What"
},
{
"id": 603,
"logprob": -0.69140625,
"text": " is"
},
{
"id": 5271,
"logprob": -12.0,
"text": " deep"
},
{
"id": 6044,
"logprob": -0.32226562,
"text": " learning"
}
],
"seed": 0,
"tokens": [
{
"id": 235336,
"logprob": 0.0,
"special": false,
"text": "?"
},
{
"id": 109,
"logprob": 0.0,
"special": false,
"text": "\n\n"
},
{
"id": 26843,
"logprob": 0.0,
"special": false,
"text": "Deep"
},
{
"id": 14715,
"logprob": -0.38671875,
"special": false,
"text": " Learning"
},
{
"id": 603,
"logprob": 0.0,
"special": false,
"text": " is"
},
{
"id": 476,
"logprob": 0.0,
"special": false,
"text": " a"
},
{
"id": 38397,
"logprob": -0.12695312,
"special": false,
"text": " subset"
},
{
"id": 576,
"logprob": 0.0,
"special": false,
"text": " of"
},
{
"id": 6479,
"logprob": 0.0,
"special": false,
"text": " machine"
},
{
"id": 6044,
"logprob": 0.0,
"special": false,
"text": " learning"
}
],
"top_tokens": null
},
"generated_text": "What is deep learning?\n\nDeep Learning is a subset of machine learning"
}

View File

@ -0,0 +1,418 @@
[
{
"details": {
"best_of_sequences": null,
"finish_reason": "length",
"generated_tokens": 10,
"prefill": [
{
"id": 2,
"logprob": null,
"text": "<bos>"
},
{
"id": 1841,
"logprob": -5.46875,
"text": "What"
},
{
"id": 603,
"logprob": -0.69140625,
"text": " is"
},
{
"id": 5271,
"logprob": -12.0,
"text": " deep"
},
{
"id": 6044,
"logprob": -0.32226562,
"text": " learning"
},
{
"id": 235336,
"logprob": -0.33203125,
"text": "?"
}
],
"seed": null,
"tokens": [
{
"id": 109,
"logprob": -0.24707031,
"special": false,
"text": "\n\n"
},
{
"id": 26843,
"logprob": -0.14550781,
"special": false,
"text": "Deep"
},
{
"id": 6044,
"logprob": -0.03857422,
"special": false,
"text": " learning"
},
{
"id": 603,
"logprob": -0.030883789,
"special": false,
"text": " is"
},
{
"id": 476,
"logprob": -0.020996094,
"special": false,
"text": " a"
},
{
"id": 38397,
"logprob": -0.828125,
"special": false,
"text": " subset"
},
{
"id": 576,
"logprob": -0.00051498413,
"special": false,
"text": " of"
},
{
"id": 6479,
"logprob": -0.05883789,
"special": false,
"text": " machine"
},
{
"id": 6044,
"logprob": -0.00020694733,
"special": false,
"text": " learning"
},
{
"id": 674,
"logprob": -0.15820312,
"special": false,
"text": " that"
}
],
"top_tokens": null
},
"generated_text": "\n\nDeep learning is a subset of machine learning that"
},
{
"details": {
"best_of_sequences": null,
"finish_reason": "length",
"generated_tokens": 10,
"prefill": [
{
"id": 2,
"logprob": null,
"text": "<bos>"
},
{
"id": 1841,
"logprob": -5.46875,
"text": "What"
},
{
"id": 603,
"logprob": -0.71484375,
"text": " is"
},
{
"id": 5271,
"logprob": -12.0,
"text": " deep"
},
{
"id": 6044,
"logprob": -0.30859375,
"text": " learning"
},
{
"id": 235336,
"logprob": -0.3359375,
"text": "?"
}
],
"seed": null,
"tokens": [
{
"id": 109,
"logprob": -0.23828125,
"special": false,
"text": "\n\n"
},
{
"id": 26843,
"logprob": -0.14550781,
"special": false,
"text": "Deep"
},
{
"id": 6044,
"logprob": -0.038330078,
"special": false,
"text": " learning"
},
{
"id": 603,
"logprob": -0.030883789,
"special": false,
"text": " is"
},
{
"id": 476,
"logprob": -0.020996094,
"special": false,
"text": " a"
},
{
"id": 38397,
"logprob": -0.80859375,
"special": false,
"text": " subset"
},
{
"id": 576,
"logprob": -0.0005455017,
"special": false,
"text": " of"
},
{
"id": 6479,
"logprob": -0.05908203,
"special": false,
"text": " machine"
},
{
"id": 6044,
"logprob": -0.00020599365,
"special": false,
"text": " learning"
},
{
"id": 674,
"logprob": -0.17285156,
"special": false,
"text": " that"
}
],
"top_tokens": null
},
"generated_text": "\n\nDeep learning is a subset of machine learning that"
},
{
"details": {
"best_of_sequences": null,
"finish_reason": "length",
"generated_tokens": 10,
"prefill": [
{
"id": 2,
"logprob": null,
"text": "<bos>"
},
{
"id": 1841,
"logprob": -5.46875,
"text": "What"
},
{
"id": 603,
"logprob": -0.71484375,
"text": " is"
},
{
"id": 5271,
"logprob": -12.0,
"text": " deep"
},
{
"id": 6044,
"logprob": -0.30859375,
"text": " learning"
},
{
"id": 235336,
"logprob": -0.3359375,
"text": "?"
}
],
"seed": null,
"tokens": [
{
"id": 109,
"logprob": -0.23828125,
"special": false,
"text": "\n\n"
},
{
"id": 26843,
"logprob": -0.14550781,
"special": false,
"text": "Deep"
},
{
"id": 6044,
"logprob": -0.038330078,
"special": false,
"text": " learning"
},
{
"id": 603,
"logprob": -0.030883789,
"special": false,
"text": " is"
},
{
"id": 476,
"logprob": -0.020996094,
"special": false,
"text": " a"
},
{
"id": 38397,
"logprob": -0.80859375,
"special": false,
"text": " subset"
},
{
"id": 576,
"logprob": -0.0005455017,
"special": false,
"text": " of"
},
{
"id": 6479,
"logprob": -0.05908203,
"special": false,
"text": " machine"
},
{
"id": 6044,
"logprob": -0.00020599365,
"special": false,
"text": " learning"
},
{
"id": 674,
"logprob": -0.17285156,
"special": false,
"text": " that"
}
],
"top_tokens": null
},
"generated_text": "\n\nDeep learning is a subset of machine learning that"
},
{
"details": {
"best_of_sequences": null,
"finish_reason": "length",
"generated_tokens": 10,
"prefill": [
{
"id": 2,
"logprob": null,
"text": "<bos>"
},
{
"id": 1841,
"logprob": -5.46875,
"text": "What"
},
{
"id": 603,
"logprob": -0.71484375,
"text": " is"
},
{
"id": 5271,
"logprob": -12.0,
"text": " deep"
},
{
"id": 6044,
"logprob": -0.30859375,
"text": " learning"
},
{
"id": 235336,
"logprob": -0.3359375,
"text": "?"
}
],
"seed": null,
"tokens": [
{
"id": 109,
"logprob": -0.23828125,
"special": false,
"text": "\n\n"
},
{
"id": 26843,
"logprob": -0.14550781,
"special": false,
"text": "Deep"
},
{
"id": 6044,
"logprob": -0.038330078,
"special": false,
"text": " learning"
},
{
"id": 603,
"logprob": -0.030883789,
"special": false,
"text": " is"
},
{
"id": 476,
"logprob": -0.020996094,
"special": false,
"text": " a"
},
{
"id": 38397,
"logprob": -0.80859375,
"special": false,
"text": " subset"
},
{
"id": 576,
"logprob": -0.0005455017,
"special": false,
"text": " of"
},
{
"id": 6479,
"logprob": -0.05908203,
"special": false,
"text": " machine"
},
{
"id": 6044,
"logprob": -0.00020599365,
"special": false,
"text": " learning"
},
{
"id": 674,
"logprob": -0.17285156,
"special": false,
"text": " that"
}
],
"top_tokens": null
},
"generated_text": "\n\nDeep learning is a subset of machine learning that"
}
]

View File

@ -0,0 +1,86 @@
import pytest
@pytest.fixture(scope="module")
def compressed_tensors_w8an_handle(launcher):
with launcher(
"neuralmagic/Llama-3.2-1B-Instruct-FP8",
num_shard=2,
quantize="compressed-tensors",
) as handle:
yield handle
@pytest.fixture(scope="module")
async def compressed_tensors_w8an(compressed_tensors_w8an_handle):
await compressed_tensors_w8an_handle.health(300)
return compressed_tensors_w8an_handle.client
@pytest.mark.release
@pytest.mark.asyncio
@pytest.mark.private
async def test_compressed_tensors_w8an(compressed_tensors_w8an, response_snapshot):
response = await compressed_tensors_w8an.generate(
"What is deep learning?",
max_new_tokens=10,
decoder_input_details=True,
)
assert (
response.generated_text
== " Deep learning is a type of artificial intelligence (AI"
)
assert response.details.generated_tokens == 10
assert response == response_snapshot
@pytest.mark.asyncio
async def test_compressed_tensors_w8an_all_params(
compressed_tensors_w8an, response_snapshot
):
response = await compressed_tensors_w8an.generate(
"What is deep learning",
max_new_tokens=10,
repetition_penalty=1.2,
return_full_text=True,
stop_sequences=["test"],
temperature=0.5,
top_p=0.9,
top_k=10,
truncate=5,
typical_p=0.9,
watermark=True,
decoder_input_details=True,
seed=0,
)
assert response.details.generated_tokens == 10
assert (
response.generated_text
== "What is deep learning?\nDeep learning, also known as neural network or"
)
assert response == response_snapshot
@pytest.mark.release
@pytest.mark.asyncio
@pytest.mark.private
async def test_compressed_tensors_w8an_load(
compressed_tensors_w8an, generate_load, response_snapshot
):
responses = await generate_load(
compressed_tensors_w8an,
"What is deep learning?",
max_new_tokens=10,
n=4,
)
assert (
responses[0].generated_text
== " Deep learning is a type of artificial intelligence (AI"
)
assert len(responses) == 4
assert all([r.generated_text == responses[0].generated_text for r in responses])
assert responses == response_snapshot

View File

@ -0,0 +1,86 @@
import pytest
@pytest.fixture(scope="module")
def compressed_tensors_wna16_handle(launcher):
with launcher(
"neuralmagic/gemma-2-2b-it-quantized.w4a16",
num_shard=2,
quantize="compressed-tensors",
) as handle:
yield handle
@pytest.fixture(scope="module")
async def compressed_tensors_wna16(compressed_tensors_wna16_handle):
await compressed_tensors_wna16_handle.health(300)
return compressed_tensors_wna16_handle.client
@pytest.mark.release
@pytest.mark.asyncio
@pytest.mark.private
async def test_compressed_tensors_wna16(compressed_tensors_wna16, response_snapshot):
response = await compressed_tensors_wna16.generate(
"What is deep learning?",
max_new_tokens=10,
decoder_input_details=True,
)
assert (
response.generated_text
== "\n\nDeep learning is a subset of machine learning that"
)
assert response.details.generated_tokens == 10
assert response == response_snapshot
@pytest.mark.asyncio
async def test_compressed_tensors_wna16_all_params(
compressed_tensors_wna16, response_snapshot
):
response = await compressed_tensors_wna16.generate(
"What is deep learning",
max_new_tokens=10,
repetition_penalty=1.2,
return_full_text=True,
stop_sequences=["test"],
temperature=0.5,
top_p=0.9,
top_k=10,
truncate=5,
typical_p=0.9,
watermark=True,
decoder_input_details=True,
seed=0,
)
assert response.details.generated_tokens == 10
assert (
response.generated_text
== "What is deep learning?\n\nDeep Learning is a subset of machine learning"
)
assert response == response_snapshot
@pytest.mark.release
@pytest.mark.asyncio
@pytest.mark.private
async def test_compressed_tensors_wna16_load(
compressed_tensors_wna16, generate_load, response_snapshot
):
responses = await generate_load(
compressed_tensors_wna16,
"What is deep learning?",
max_new_tokens=10,
n=4,
)
assert (
responses[0].generated_text
== "\n\nDeep learning is a subset of machine learning that"
)
assert len(responses) == 4
assert all([r.generated_text == responses[0].generated_text for r in responses])
assert responses == response_snapshot

View File

@ -212,6 +212,8 @@ enum Quantization {
/// <https://hf.co/models?search=awq>.
/// Should replace GPTQ models wherever possible because of the better latency
Awq,
/// Compressed tensors, which can be a mixture of different quantization methods.
CompressedTensors,
/// 8 bit quantization, doesn't require specific model.
/// Should be a drop-in replacement to bitsandbytes with much better performance.
/// Kernels are from <https://github.com/NetEase-FuXi/EETQ.git>
@ -274,6 +276,9 @@ impl std::fmt::Display for Quantization {
Quantization::Awq => {
write!(f, "awq")
}
Quantization::CompressedTensors => {
write!(f, "compressed-tensors")
}
Quantization::Eetq => {
write!(f, "eetq")
}

View File

@ -5,6 +5,7 @@
mypy-protobuf,
awq-inference-engine,
causal-conv1d,
compressed-tensors,
eetq,
einops,
exllamav2,
@ -74,6 +75,7 @@ buildPythonPackage {
awq-inference-engine
eetq
causal-conv1d
compressed-tensors
einops
exllamav2
flashinfer

View File

@ -23,7 +23,7 @@ gen-server:
install-server: gen-server
pip install pip --upgrade
pip install -r requirements_cuda.txt
pip install -e ".[accelerate, quantize, peft, outlines]"
pip install -e ".[accelerate, compressed-tensors, quantize, peft, outlines]"
install: install-cuda

server/poetry.lock (generated, 24 changes)
View File

@ -1,4 +1,4 @@
# This file is automatically @generated by Poetry 1.8.3 and should not be changed by hand.
# This file is automatically @generated by Poetry 1.8.4 and should not be changed by hand.
[[package]]
name = "accelerate"
@ -388,6 +388,26 @@ files = [
{file = "colorama-0.4.6.tar.gz", hash = "sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44"},
]
[[package]]
name = "compressed-tensors"
version = "0.7.1"
description = "Library for utilization of compressed safetensors of neural network models"
optional = true
python-versions = "*"
files = [
{file = "compressed-tensors-0.7.1.tar.gz", hash = "sha256:3c7865ebfe4ea76ae94d7c674bcf93aedd2064571f682c09a377a219d5ebb3a0"},
{file = "compressed_tensors-0.7.1-py3-none-any.whl", hash = "sha256:22d11558a70f655ae647db9c8e9fb14a5e9d6983ca5aec3f267518625fd6dd0e"},
]
[package.dependencies]
pydantic = ">=2.0"
torch = ">=1.7.0"
transformers = "*"
[package.extras]
accelerate = ["accelerate"]
dev = ["black (==22.12.0)", "flake8 (>=3.8.3)", "isort (==5.8.0)", "nbconvert (>=7.16.3)", "pytest (>=6.0.0)", "wheel (>=0.36.2)"]
[[package]]
name = "datasets"
version = "2.21.0"
@ -3982,4 +4002,4 @@ torch = ["torch"]
[metadata]
lock-version = "2.0"
python-versions = ">=3.9,<3.13"
content-hash = "b39033e573f50a0f046787aebf1702d86673aad0b2fcee818404fcea7f644b81"
content-hash = "4636689efd4c94559c3c23903aafcffd177533a3b9006b3b4f8491b158a3a754"

View File

@ -37,6 +37,7 @@ pillow = "^10.0.0"
outlines= { version = "^0.0.34", optional = true }
prometheus-client = "^0.20.0"
py-cpuinfo = "^9.0.0"
compressed-tensors = { version = "^0.7.1", optional = true }
# Remove later, temporary workaround for outlines.
numpy = "^1.26"
@ -58,6 +59,7 @@ rich = "^13.7.1"
torch = ["torch"]
accelerate = ["accelerate"]
bnb = ["bitsandbytes"]
compressed-tensors = ["compressed-tensors"]
marlin = ["marlin-kernels"]
moe = ["moe-kernels"]
peft = ["peft"]

View File

@ -19,6 +19,7 @@ class Quantization(str, Enum):
bitsandbytes_fp4 = "bitsandbytes-fp4"
gptq = "gptq"
awq = "awq"
compressed_tensors = "compressed-tensors"
eetq = "eetq"
exl2 = "exl2"
fp8 = "fp8"

View File

@ -0,0 +1,3 @@
from .loader import CompressedTensorsLoader
__all__ = ["CompressedTensorsLoader"]

View File

@ -0,0 +1,174 @@
from typing import Any, Dict, List, Union
from compressed_tensors import QuantizationConfig, QuantizationStatus
from compressed_tensors.config import CompressionFormat
from compressed_tensors.quantization import (
QuantizationScheme,
QuantizationType,
find_name_or_class_matches,
)
from loguru import logger
from pydantic import ValidationError
from torch import nn
from text_generation_server.layers.compressed_tensors.w8an_fp import W8ANFpLoader
from text_generation_server.layers.compressed_tensors.wna16_int import WNA16Loader
from text_generation_server.utils.log import log_once
from text_generation_server.utils.weights import (
DefaultWeightsLoader,
UnquantizedWeight,
Weights,
WeightsLoader,
)
# compressed-tensors can match modules as quantization targets. However,
# they need to be objects rather than classes or class names. Since we
# need to match `Linear` targets, make an instance that can be re-used.
_EMPTY_LINEAR: nn.Module = nn.Linear(0, 0)
class CompressedTensorsLoader(WeightsLoader):
"""Loader for checkpoints stored in the compressed-tensors format."""
def __init__(self, config: Dict[str, Any]):
quantization_config_raw = config.get("quantization_config")
if quantization_config_raw is None:
# `compression_config` was renamed to `quantization_config`; support
# retained for backward compatibility.
quantization_config_raw = config.get("compression_config")
if quantization_config_raw is None:
raise ValueError(
"Checkpoint does not have compressed-tensors configuration"
)
try:
quantization_config = QuantizationConfig.model_validate(
quantization_config_raw
)
except ValidationError as e:
raise ValueError("Cannot parse compressed-tensors configuration") from e
if quantization_config.quantization_status not in (
QuantizationStatus.COMPRESSED,
QuantizationStatus.FROZEN,
):
raise ValueError(
f"Model quantization was not finished, status was: {quantization_config.quantization_status}"
)
self.ignore = (
quantization_config.ignore if quantization_config.ignore is not None else []
)
self.loaders = self._get_target_loaders(quantization_config)
for target, loader in self.loaders.items():
log_once(
logger.info,
f"Using {loader} for compressed-tensors target '{target}'",
)
def get_weights(self, weights: Weights, prefix: str):
loader = self._lookup_loader(prefix)
return loader.get_weights(weights, prefix)
def get_weights_col_packed(
self,
weights: "Weights",
prefix: str,
block_sizes: Union[int, List[int]],
):
loader = self._lookup_loader(prefix)
return loader.get_weights_col_packed(weights, prefix, block_sizes)
def get_multi_weights_col(self, weights: Weights, prefixes: List[str], dim: int):
loader = self._lookup_loader(prefixes[0])
return loader.get_multi_weights_col(weights, prefixes, dim)
def get_weights_row(self, weights: Weights, prefix: str):
loader = self._lookup_loader(prefix)
return loader.get_weights_row(weights, prefix)
def _get_target_loaders(
self, quantization_config: QuantizationConfig
) -> Dict[str, WeightsLoader]:
"""
A compressed-tensors checkpoint can use different quantizations
for different targets. This method returns a dictionary with a
loader per target.
"""
loaders: Dict[str, WeightsLoader] = {}
format = quantization_config.format
for group_name, group in quantization_config.config_groups.items():
# The group configuration can be a string, but does that ever
# happen in a serialized quantization config?
assert isinstance(group, QuantizationScheme)
loader = self._create_loader_for_group(format, group_name, group)
# A quantized parameter group can have multiple targets, add the
# loader for all the targets.
for target in group.targets:
if target in loaders:
raise ValueError(
f"Target '{target} has multiple configured loaders'"
)
loaders[target] = loader
return loaders
def _create_loader_for_group(
self, format: str, group_name: str, group: QuantizationScheme
) -> WeightsLoader:
"""
Find and create a loader for the group with the given quantization
scheme.
"""
# NOTE: we ignore group.output_activations because we don't support
# output quantization yet.
input_activations = group.input_activations
weights = group.weights
if (
format
in {
CompressionFormat.float_quantized.value,
CompressionFormat.naive_quantized.value,
}
and weights is not None
and weights.type == QuantizationType.FLOAT
and weights.num_bits == 8
):
# FP W8A8 or W8A16.
return W8ANFpLoader(input_activations=input_activations, weights=weights)
elif (
format == CompressionFormat.pack_quantized.value
and weights is not None
and weights.type == QuantizationType.INT
and weights.num_bits in (4, 8)
):
# INT W4A16 or W8A16 (GPTQ/AWQ-like).
return WNA16Loader(weights)
else:
raise ValueError(
f"Group '{group_name}' has unsupported compressed-tensors configurtion"
)
def _lookup_loader(self, prefix: str) -> WeightsLoader:
"""
Look up the loader to use for a given parameter name (prefix).
"""
if len(find_name_or_class_matches(prefix, _EMPTY_LINEAR, self.ignore)) > 0:
return DefaultWeightsLoader(UnquantizedWeight)
# We currently only handle linear layers, so unconditionally pass
# a `Linear` instance.
targets = find_name_or_class_matches(prefix, _EMPTY_LINEAR, self.loaders.keys())
if len(targets) == 0:
raise ValueError(
f"Cannot find compressed-tensors target for prefix: {prefix}"
)
return self.loaders[targets[0]]
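
A minimal sketch of the target matching that _lookup_loader relies on,
using the same find_name_or_class_matches helper as above. The prefixes,
targets, and ignore list here are hypothetical:

# Hypothetical example of compressed-tensors target matching.
from torch import nn
from compressed_tensors.quantization import find_name_or_class_matches

_EMPTY_LINEAR = nn.Linear(0, 0)  # stand-in module; only Linear layers are matched

targets = ["Linear"]  # a config group that targets all linear layers
ignore = ["lm_head"]  # excluded from quantization

for prefix in ["model.layers.0.self_attn.q_proj", "lm_head"]:
    if find_name_or_class_matches(prefix, _EMPTY_LINEAR, ignore):
        print(prefix, "-> unquantized (ignored)")
    else:
        matches = find_name_or_class_matches(prefix, _EMPTY_LINEAR, targets)
        print(prefix, "-> loader for target", matches[0])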

View File

@ -0,0 +1,174 @@
from typing import List, Optional, Union
import torch
from compressed_tensors.quantization import QuantizationArgs, QuantizationType
from text_generation_server.layers.fp8 import Fp8Weight, _load_scalar_or_matrix_scale
from text_generation_server.utils.weights import Weights, WeightsLoader
class W8ANFpLoader(WeightsLoader):
"""
Loader for W8A8/W8A16 FP compressed-tensors parameters.
"""
def __init__(
self,
*,
input_activations: Optional[QuantizationArgs],
weights: QuantizationArgs,
):
assert weights.type == QuantizationType.FLOAT and weights.num_bits == 8
# We ignore the `strategy` option which sets the scales to be
# per-tensor, per-channel or per-token. What scales are supported
# is dependent on the kernels used (e.g. cutlass can do tokenwise,
# Torch cannot, and FP8-Marlin does not quantize inputs at all).
# So, instead we try to use the best-possible configuration.
self.load_weight_scale = not weights.dynamic
self.load_input_scale = (
input_activations is not None and not input_activations.dynamic
)
self.force_w8a16 = (
input_activations is not None and input_activations.num_bits == 16
)
def __str__(self) -> str:
def scale_to_str(scale):
return "static" if scale else "dynamic"
quantization_type = f"W8A{16 if self.force_w8a16 else 8}"
return f"{self.__class__.__name__} ({quantization_type}, weight: {scale_to_str(self.load_weight_scale)}, input: {scale_to_str(self.load_input_scale)})"
def get_weights(self, weights: "Weights", prefix: str):
w = weights.get_tensor(f"{prefix}.weight")
weight_scale = None
if self.load_weight_scale:
weight_scale = (
weights.get_tensor(f"{prefix}.weight_scale", to_dtype=False)
.reshape(-1)
.expand(w.shape[0])
)
input_scale = None
if self.load_input_scale:
input_scale = weights.get_tensor(
f"{prefix}.input_scale", to_dtype=False
).reshape(-1)
return Fp8Weight(
weight=w,
weight_scale=weight_scale,
input_scale=input_scale,
dtype=weights.dtype,
force_w8a16=self.force_w8a16,
)
def get_weights_col_packed(
self,
weights: Weights,
prefix: str,
block_sizes: Union[int, List[int]],
):
w = weights.get_packed_sharded(
f"{prefix}.weight", dim=0, block_sizes=block_sizes
)
weight_scale = None
if self.load_weight_scale:
weight_scale = weights.get_tensor(f"{prefix}.weight_scale", to_dtype=False)
if weight_scale.numel() > 1:
weight_scale = weights.get_packed_sharded(
f"{prefix}.weight_scale",
dim=0,
block_sizes=block_sizes,
to_dtype=False,
)
weight_scale = weight_scale.reshape(-1).expand(w.shape[0])
input_scale = None
if self.load_input_scale:
input_scale = weights.get_tensor(f"{prefix}.input_scale", to_dtype=False)
if input_scale.numel() > 1:
input_scale = weights.get_packed_sharded(
f"{prefix}.input_scale",
dim=0,
block_sizes=block_sizes,
to_dtype=False,
)
input_scale = input_scale.reshape(-1).max()
return Fp8Weight(
weight=w,
weight_scale=weight_scale,
input_scale=input_scale,
dtype=weights.dtype,
force_w8a16=self.force_w8a16,
)
def get_multi_weights_col(self, weights: "Weights", prefixes: List[str], dim: int):
# FIXME: Force to_device to false as fp8 weights do not support torch.cat on device yet
w = [
weights.get_sharded(f"{p}.weight", dim=0, to_device=False) for p in prefixes
]
shapes = [x.shape for x in w]
# Concat then send to the device
w = torch.cat(w, dim=dim).to(weights.device)
weight_scale = None
if self.load_weight_scale:
weight_scale = [
_load_scalar_or_matrix_scale(weights, f"{p}.weight_scale", shape)
for p, shape in zip(prefixes, shapes)
]
weight_scale = torch.cat(weight_scale, dim=0).reshape(-1)
input_scale = None
if self.load_input_scale:
input_scale = [
_load_scalar_or_matrix_scale(weights, f"{p}.input_scale", shape)
for p, shape in zip(prefixes, shapes)
if weights.has_tensor(f"{p}.input_scale")
]
assert len(input_scale) == 0 or len(input_scale) == len(prefixes)
input_scale = (
torch.cat(input_scale, dim=0).reshape(-1).max()
if len(input_scale) != 0
else None
)
return Fp8Weight(
weight=w,
weight_scale=weight_scale,
input_scale=input_scale,
dtype=weights.dtype,
force_w8a16=self.force_w8a16,
)
def get_weights_row(self, weights: "Weights", prefix: str):
w = weights.get_sharded(f"{prefix}.weight", dim=1)
weight_scale = None
if self.load_weight_scale:
weight_scale = (
weights.get_tensor(f"{prefix}.weight_scale", to_dtype=False)
.reshape(-1)
.expand(w.shape[0])
)
input_scale = None
if self.load_input_scale:
input_scale = weights.get_tensor(
f"{prefix}.input_scale", to_dtype=False
).reshape(-1)
return Fp8Weight(
weight=w,
weight_scale=weight_scale,
input_scale=input_scale,
dtype=weights.dtype,
force_w8a16=self.force_w8a16,
)
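
As a sketch of how the loader's flags fall out of a config group, here is a
hypothetical FP8 group with static weight scales and dynamic input scales
(all values illustrative):

# Hypothetical QuantizationArgs mirroring W8ANFpLoader.__init__ above.
from compressed_tensors.quantization import QuantizationArgs, QuantizationType

weights = QuantizationArgs(
    num_bits=8, type=QuantizationType.FLOAT, strategy="tensor", dynamic=False
)
input_activations = QuantizationArgs(
    num_bits=8, type=QuantizationType.FLOAT, strategy="token", dynamic=True
)

load_weight_scale = not weights.dynamic
# True: the checkpoint stores weight_scale tensors.
load_input_scale = input_activations is not None and not input_activations.dynamic
# False: input scales are computed at runtime instead of loaded.
force_w8a16 = input_activations is not None and input_activations.num_bits == 16
# False: 8-bit input activations, so W8A8 kernels can be used.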

View File

@ -0,0 +1,188 @@
from typing import List, Union
import torch
from compressed_tensors.quantization import ActivationOrdering, QuantizationArgs
from loguru import logger
from text_generation_server.layers.marlin.gptq import repack_gptq_for_marlin
from text_generation_server.utils.log import log_once
from text_generation_server.utils.weights import Weights, WeightsLoader
class WNA16Loader(WeightsLoader):
"""
Loader for W4A16/W8A16 INT compressed-tensors parameters.
"""
def __init__(self, weights: QuantizationArgs):
self.weights = weights
self.desc_act = self.weights.actorder == ActivationOrdering.GROUP
self.groupsize = (
-1 if self.weights.group_size is None else self.weights.group_size
)
def __str__(self) -> str:
quantization_type = f"W{self.weights.num_bits}8A16"
return f"{self.__class__.__name__} ({quantization_type})"
def get_weights(self, weights: Weights, prefix: str):
log_once(logger.info, "Using GPTQ-Marlin kernels")
try:
weight_packed = weights.get_tensor(f"{prefix}.weight_packed").t()
except RuntimeError:
raise RuntimeError(
f"Cannot load w{self.weights.num_bits}a16 weight, make sure the model is already quantized"
)
zero_point = None
if not self.weights.symmetric:
zero_point = weights.get_tensor(f"{prefix}.weight_zero_point").t()
g_idx = None
if self.desc_act:
g_idx = weights.get_tensor(f"{prefix}.weight_g_idx")
scales = weights.get_tensor(f"{prefix}.weight.scales").t()
return repack_gptq_for_marlin(
qweight=weight_packed.contiguous(),
scales=scales,
qzeros=zero_point,
g_idx=g_idx,
bits=self.weights.num_bits,
desc_act=self.desc_act,
groupsize=self.groupsize,
quant_method="compressed-tensors",
sym=self.weights.symmetric,
sharded_infeatures=False,
)
def get_weights_col_packed(
self,
weights: Weights,
prefix: str,
block_sizes: Union[int, List[int]],
):
try:
weight_packed = weights.get_packed_sharded(
f"{prefix}.weight_packed", dim=0, block_sizes=block_sizes
).t()
except RuntimeError:
raise RuntimeError(
f"Cannot load w{self.weights.num_bits}a16 weight, make sure the model is already quantized"
)
scales = weights.get_packed_sharded(
f"{prefix}.weight_scale", dim=0, block_sizes=block_sizes
).t()
scales = scales.to(dtype=weights.dtype)
zero_point = None
if not self.weights.symmetric:
zero_point = weights.get_packed_sharded(
f"{prefix}.qzeros", dim=0, block_sizes=block_sizes
).t()
g_idx = None
if self.desc_act:
g_idx = weights.get_tensor(f"{prefix}.g_idx")
return repack_gptq_for_marlin(
qweight=weight_packed.contiguous(),
scales=scales,
qzeros=zero_point,
g_idx=g_idx,
bits=self.weights.num_bits,
desc_act=self.desc_act,
groupsize=self.groupsize,
quant_method="compressed-tensors",
sym=self.weights.symmetric,
sharded_infeatures=False,
)
def get_multi_weights_col(self, weights: Weights, prefixes: List[str], dim: int):
try:
weight_packed = torch.cat(
[
weights.get_sharded(f"{p}.weight_packed", dim=0).t()
for p in prefixes
],
dim=1,
)
except RuntimeError:
raise RuntimeError(
f"Cannot load w{self.weights.num_bits}a16 weight, make sure the model is already quantized"
)
scales = torch.cat(
[weights.get_sharded(f"{p}.weight_scale", dim=0).t() for p in prefixes],
dim=1,
)
zero_point = None
if not self.weights.symmetric:
zero_point = torch.cat(
[weights.get_sharded(f"{p}.qzeros", dim=0).t() for p in prefixes], dim=1
).t()
g_idx = None
if self.desc_act:
w = [weights.get_tensor(f"{p}.g_idx") for p in prefixes]
for w2 in w[1:]:
torch.testing.assert_close(w2, w[0])
g_idx = w[0]
return repack_gptq_for_marlin(
qweight=weight_packed.contiguous(),
scales=scales,
qzeros=zero_point,
g_idx=g_idx,
bits=self.weights.num_bits,
desc_act=self.desc_act,
groupsize=self.groupsize,
quant_method="compressed-tensors",
sym=self.weights.symmetric,
sharded_infeatures=False,
)
def get_weights_row(self, weights: Weights, prefix: str):
log_once(logger.info, "Using GPTQ-Marlin kernels")
try:
weight_packed = weights.get_sharded(f"{prefix}.weight_packed", dim=1).t()
except RuntimeError:
raise RuntimeError(
f"Cannot load `{self.quantize}` weight, make sure the model is already quantized."
)
zero_point = None
if not self.weights.symmetric:
if self.desc_act or self.groupsize == -1:
zero_point = weights.get_tensor(f"{prefix}.weight_zero_point").t()
else:
zero_point = weights.get_sharded(
f"{prefix}.weight_zero_point", dim=1
).t()
g_idx = None
if self.desc_act:
g_idx = weights.get_sharded(f"{prefix}.g_idx", dim=0)
if self.desc_act or self.groupsize == -1:
scales = weights.get_tensor(f"{prefix}.weight_scale").t()
else:
scales = weights.get_sharded(f"{prefix}.weight_scale", dim=1).t()
sharded_in_features = weights.process_group.size() > 1
return repack_gptq_for_marlin(
qweight=weight_packed.contiguous(),
scales=scales,
qzeros=zero_point,
g_idx=g_idx,
bits=self.weights.num_bits,
desc_act=self.desc_act,
groupsize=self.groupsize,
quant_method="compressed-tensors",
sym=self.weights.symmetric,
sharded_infeatures=sharded_in_features,
)
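
For reference, a hypothetical config group that _create_loader_for_group
would route to this loader (format pack-quantized, INT weights, no input
activation quantization); the values are assumptions, not from a specific
checkpoint:

# Hypothetical W4A16 group for the pack-quantized INT path.
group = {
    "targets": ["Linear"],
    "weights": {
        "type": "int",
        "num_bits": 4,          # 4 or 8 bits are accepted
        "group_size": 128,      # groupsize passed to the Marlin repack
        "symmetric": True,      # asymmetric checkpoints also load zero points
        "strategy": "group",
    },
    # No "input_activations": activations stay in the model dtype (A16).
}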

View File

@ -29,7 +29,7 @@ else:
CUTLASS_FP8_AVAILABLE = False
def get_fp8_linear() -> Type[torch.nn.Module]:
def get_fp8_linear(force_w8a16: bool = False) -> Type[torch.nn.Module]:
"""
Return an FP8 linear `Module` that is compatible with the current system.
"""
@ -37,7 +37,14 @@ def get_fp8_linear() -> Type[torch.nn.Module]:
if SYSTEM == "cuda":
major, _ = torch.cuda.get_device_capability()
if major == 8 and os.getenv("USE_CUTLASS_W8A8", "0") != "1":
# Marlin is W8A16, use it when:
#
# - On capability 8.x where x < 8: W8A8 FP8 GEMM is not supported.
# - On capability 8.9: W8A8 FP8 GEMM is supported, but Marlin-FP8 is faster.
# - On capability 9.x when force_w8a16: cutlass kernels do not support W8A16.
if (major == 8 or (major == 9 and force_w8a16)) and os.getenv(
"USE_CUTLASS_W8A8", "0"
) != "1":
# NOTE: Capability 8.9 is supported by cutlass kernels, but FP8-Marlin
# gives better decoding throughput on L4 and L40.
from text_generation_server.layers.marlin import GPTQMarlinFP8Linear
@ -283,14 +290,17 @@ class Fp8Weight(Weight):
weight_scale: Optional[torch.Tensor] = None
input_scale: Optional[torch.Tensor] = None
activation_scale_ub: Optional[float] = None
force_w8a16: bool = False
def get_linear(self, bias: torch.Tensor):
if self.weight_scale is None:
return get_fp8_linear().from_unquant(self.weight, bias, self.dtype)
return get_fp8_linear(force_w8a16=self.force_w8a16).from_unquant(
self.weight, bias, self.dtype
)
# This is not checked by the fbgemm kernels, but they require contiguous
# memory. Can be non-contiguous when we e.g. expand from scalars.
self.weight_scale = self.weight_scale.contiguous()
return get_fp8_linear().from_fp8(
return get_fp8_linear(force_w8a16=self.force_w8a16).from_fp8(
weight=self.weight,
scale=self.weight_scale,
dtype=self.dtype,
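
Restated compactly, the kernel choice above (helper name hypothetical,
capability values illustrative, assuming USE_CUTLASS_W8A8 is unset):

# Sketch of the dispatch: Marlin (W8A16) vs cutlass/Torch (W8A8).
import os

def use_marlin_fp8(major: int, force_w8a16: bool) -> bool:
    # 8.x: W8A8 FP8 GEMM unsupported or slower than Marlin;
    # 9.x only when W8A16 is forced, since cutlass has no W8A16 support.
    return (major == 8 or (major == 9 and force_w8a16)) and os.getenv(
        "USE_CUTLASS_W8A8", "0"
    ) != "1"

assert use_marlin_fp8(8, False)      # Ampere/Ada: Marlin
assert not use_marlin_fp8(9, False)  # Hopper W8A8: cutlass
assert use_marlin_fp8(9, True)       # Hopper W8A16: Marlin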

View File

@ -261,7 +261,7 @@ class GPTQMarlinWeight(Weight):
def __post_init__(self):
assert self.qweight.dtype == torch.int32
assert self.scales.dtype == torch.float16
assert self.scales.dtype in (torch.float16, torch.bfloat16)
assert self.g_idx.dtype == torch.int32
assert self.perm.dtype == torch.int32
@ -300,7 +300,7 @@ def repack_gptq_for_marlin(
raise RuntimeError(
f"Repacking GPTQ weights with group size {groupsize} as Marlin is not supported, must be one of: {supported_sizes}"
)
if not (sym or quant_method == "awq"):
if not (sym or quant_method == "awq" or quant_method == "compressed-tensors"):
raise RuntimeError(
"Repacking GPTQ weights with asymmetric quantization as Marlin is not supported."
)

View File

@ -370,46 +370,23 @@ def get_model(
compression_config = config_dict.get("compression_config", None)
if quantization_config is not None and quantize is None:
method = quantization_config.get("quant_method", None)
config_groups = quantization_config.get("config_groups", None)
if method in {"gptq", "awq", "exl2"}:
log_master(logger.info, f"Auto selecting quantization method {method}")
quantize = method
elif method == "fbgemm_fp8" or method == "fp8":
log_master(logger.info, "Auto selecting quantization method fp8")
quantize = "fp8"
elif config_groups is not None:
# TODO: at some point we should probably fully parse the compression
# configuration to know which parameters are compressed.
for _, group in config_groups.items():
weights_config = group.get("weights")
if weights_config is not None:
if (
weights_config["type"] == "float"
and weights_config["num_bits"] == 8
):
if method == "compressed-tensors":
log_master(
logger.info, "Auto selecting quantization method fp8"
logger.info, "Auto selecting quantization method compressed-tensors"
)
quantize = "fp8"
break
quantize = "compressed-tensors"
else:
log_master(logger.warning, f"Unknown quantization method {method}")
elif compression_config is not None:
# `compression_config` renamed to `quantization_config`; support retained for backward compatibility.
config_groups = compression_config.get("config_groups")
if config_groups is not None:
for _, group in config_groups.items():
weights_config = group.get("weights")
if weights_config is not None:
if (
weights_config["type"] == "float"
and weights_config["num_bits"] == 8
):
log_master(
logger.info, "Auto selecting quantization method fp8"
)
quantize = "fp8"
break
log_master(logger.info, "Auto selecting quantization method compressed-tensors")
quantize = "compressed-tensors"
if dtype is None:
if quantize in ["awq", "exl2", "gptq", "marlin"]:

View File

@ -27,7 +27,20 @@ class _FP8QuantizerConfig:
activation_scale_ub: float
# We should probably do this with Pytantic JSON deserialization,
def _get_config_json(model_id: str, revision: Optional[str], filename: str):
if os.path.exists(os.path.join(model_id, filename)):
filename = os.path.join(model_id, filename)
else:
filename = hf_hub_download(model_id, filename=filename, revision=revision)
with open(filename, "r") as f:
return json.load(f)
# We should probably do this with Pydantic JSON deserialization,
# but for now we'll stay close to the old _set_gptq_params.
def _get_quantizer_config(model_id, revision):
bits = 4
@ -39,12 +52,7 @@ def _get_quantizer_config(model_id, revision):
filename = "config.json"
try:
if os.path.exists(os.path.join(model_id, filename)):
filename = os.path.join(model_id, filename)
else:
filename = hf_hub_download(model_id, filename=filename, revision=revision)
with open(filename, "r") as f:
data = json.load(f)
data = _get_config_json(model_id, revision, filename)
# FP8 config
if data["quantization_config"]["quant_method"] == "fbgemm_fp8":
@ -67,14 +75,7 @@ def _get_quantizer_config(model_id, revision):
except Exception:
filename = "quantize_config.json"
try:
if os.path.exists(os.path.join(model_id, filename)):
filename = os.path.join(model_id, filename)
else:
filename = hf_hub_download(
model_id, filename=filename, revision=revision
)
with open(filename, "r") as f:
data = json.load(f)
data = _get_config_json(model_id, revision, filename)
bits = data["bits"]
groupsize = data["group_size"]
@ -90,14 +91,7 @@ def _get_quantizer_config(model_id, revision):
except Exception:
filename = "quant_config.json"
try:
if os.path.exists(os.path.join(model_id, filename)):
filename = os.path.join(model_id, filename)
else:
filename = hf_hub_download(
model_id, filename=filename, revision=revision
)
with open(filename, "r") as f:
data = json.load(f)
data = _get_config_json(model_id, revision, filename)
bits = data["w_bit"]
groupsize = data["q_group_size"]
desc_act = data["desc_act"]
@ -119,6 +113,14 @@ def _get_quantizer_config(model_id, revision):
def get_loader(
quantize: Optional[str], model_id: str, revision: Optional[str]
) -> WeightsLoader:
if quantize == "compressed-tensors":
config = _get_config_json(model_id, revision, "config.json")
from text_generation_server.layers.compressed_tensors import (
CompressedTensorsLoader,
)
return CompressedTensorsLoader(config)
quantizer_config = _get_quantizer_config(model_id, revision)
if quantize in {"awq", "gptq"}:
from text_generation_server.layers.gptq import GPTQWeightsLoader
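
A sketch of the resulting entry point (the model id is borrowed from the
integration tests above; illustrative only):

# Obtain the compressed-tensors weights loader for a quantized checkpoint.
from text_generation_server.utils.quantization import get_loader

loader = get_loader(
    quantize="compressed-tensors",
    model_id="neuralmagic/Llama-3.2-1B-Instruct-FP8",
    revision=None,
)
# `loader` is a CompressedTensorsLoader that dispatches to a per-target
# loader (for this FP8 checkpoint, the W8AN FP loader) as weights load.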