Add initial support for compressed-tensors checkpoints (#2732)

compressed-tensors is a safetensors extension for sparse, quantized tensors. The format is more powerful than earlier AWQ/GPTQ/FP8 quantization, because:

- Different quantizer configurations can be used for different targets.
- The format can specify input/output quantizers in addition to weight quantizers.
- Configurable exclusions for quantization.

This change adds a dependency on the `compressed-tensors` package for its configuration parsing and layer matching functionality.

The following types of quantization are supported in this PR:

- W8A16 and W4A16 INT using GPTQ-Marlin kernels.
- W8A8 and W8A16 FP using FP8-Marlin and cutlass kernels.

Support for other quantization types will be added in subsequent PRs.
2024-11-10 13:54:07 +01:00 · 2024-11-10 13:54:07 +01:00 · a785000842
parent 97f7a22f0b
commit a785000842
28 changed files with 2052 additions and 78 deletions
--- a/2
+++ b/2
@ -247,7 +247,7 @@ COPY server/Makefile server/Makefile
 RUN cd server && \
    make gen-server && \
    pip install -r requirements_cuda.txt && \
-    pip install ".[bnb, accelerate, marlin, moe, quantize, peft, outlines]" --no-cache-dir && \
+    pip install ".[bnb, accelerate, compressed-tensors, marlin, moe, quantize, peft, outlines]" --no-cache-dir && \
    pip install nvidia-nccl-cu12==2.22.3
 ENV LD_PRELOAD=/opt/conda/lib/python3.11/site-packages/nvidia/nccl/lib/libnccl.so.2
--- a/2
+++ b/2
@ -296,7 +296,7 @@ COPY server/Makefile server/Makefile
 RUN cd server && \
    make gen-server && \
    pip install -r requirements_rocm.txt && \
-    pip install ".[accelerate, peft, outlines]" --no-cache-dir
+    pip install ".[accelerate, compressed-tensors, peft, outlines]" --no-cache-dir
 # Install benchmarker
 COPY --from=builder /usr/src/target/release-opt/text-generation-benchmark /usr/local/bin/text-generation-benchmark
--- a/2
+++ b/2
@ -102,7 +102,7 @@ COPY server/Makefile server/Makefile
 RUN cd server && \
    make gen-server && \
    pip install -r requirements_intel.txt && \
-    pip install ".[accelerate, peft, outlines]" --no-cache-dir
+    pip install ".[accelerate, compressed-tensors, peft, outlines]" --no-cache-dir
 ENV CCL_ROOT=/opt/intel/oneapi/ccl/latest
 ENV I_MPI_ROOT=/opt/intel/oneapi/mpi/latest
--- a/docs/source/reference/launcher.md
+++ b/docs/source/reference/launcher.md
@ -63,6 +63,7 @@ Options:
          Possible values:
          - awq:                4 bit quantization. Requires a specific AWQ quantized model: <https://hf.co/models?search=awq>. Should replace GPTQ models wherever possible because of the better latency
          - compressed-tensors: Compressed tensors, which can be a mixture of different quantization methods
          - eetq:               8 bit quantization, doesn't require specific model. Should be a drop-in replacement to bitsandbytes with much better performance. Kernels are from <https://github.com/NetEase-FuXi/EETQ.git>
          - exl2:               Variable bit quantization. Requires a specific EXL2 quantized model: <https://hf.co/models?search=exl2>. Requires exllama2 kernels and does not support tensor parallelism (num_shard > 1)
          - gptq:               4 bit quantization. Requires a specific GPTQ quantized model: <https://hf.co/models?search=gptq>. text-generation-inference will use exllama (faster) kernels wherever possible, and use triton kernel (wider support) when it's not. AWQ has faster kernels
--- a/flake.lock
+++ b/flake.lock
@ -978,15 +978,16 @@
        "nixpkgs": "nixpkgs_6"
      },
      "locked": {
-        "lastModified": 1730724647,
+        "lastModified": 1730795478,
-        "narHash": "sha256-SVv+50CGaCoU4zZwsg6ZAaOi/D5QJBL1P2SIB+3CEf4=",
+        "narHash": "sha256-xpkXDKnkhXO4F6Ea3reHmqwXXRzQe2PsxdRQFPCViWs=",
        "owner": "huggingface",
        "repo": "text-generation-inference-nix",
-        "rev": "1512898a1e5ad9eff025205fa9c4d33a44506cf3",
+        "rev": "b7f6c07867d94d6e55f5352573a6b3dad1c88e56",
        "type": "github"
      },
      "original": {
        "owner": "huggingface",
        "ref": "compressed-tensors-0.7.1",
        "repo": "text-generation-inference-nix",
        "type": "github"
      }
--- a/flake.nix
+++ b/flake.nix
@ -5,7 +5,7 @@
      inputs.nixpkgs.follows = "tgi-nix/nixpkgs";
    };
    nix-filter.url = "github:numtide/nix-filter";
-    tgi-nix.url = "github:huggingface/text-generation-inference-nix";
+    tgi-nix.url = "github:huggingface/text-generation-inference-nix/compressed-tensors-0.7.1";
    nixpkgs.follows = "tgi-nix/nixpkgs";
    flake-utils.url = "github:numtide/flake-utils";
    rust-overlay = {
--- a/integration-tests/models/snapshots/test_compressed_tensors_w8an_fp/test_compressed_tensors_w8an.json
+++ b/integration-tests/models/snapshots/test_compressed_tensors_w8an_fp/test_compressed_tensors_w8an.json
@ -0,0 +1,104 @@
 {
  "details": {
    "best_of_sequences": null,
    "finish_reason": "length",
    "generated_tokens": 10,
    "prefill": [
      {
        "id": 128000,
        "logprob": null,
        "text": "<|begin_of_text|>"
      },
      {
        "id": 3923,
        "logprob": -7.609375,
        "text": "What"
      },
      {
        "id": 374,
        "logprob": -0.92529297,
        "text": " is"
      },
      {
        "id": 5655,
        "logprob": -10.0,
        "text": " deep"
      },
      {
        "id": 6975,
        "logprob": -0.94628906,
        "text": " learning"
      },
      {
        "id": 30,
        "logprob": -2.9042969,
        "text": "?"
      }
    ],
    "seed": null,
    "tokens": [
      {
        "id": 18682,
        "logprob": -0.8769531,
        "special": false,
        "text": " Deep"
      },
      {
        "id": 6975,
        "logprob": -0.0076942444,
        "special": false,
        "text": " learning"
      },
      {
        "id": 374,
        "logprob": -0.25073242,
        "special": false,
        "text": " is"
      },
      {
        "id": 264,
        "logprob": -0.097595215,
        "special": false,
        "text": " a"
      },
      {
        "id": 955,
        "logprob": -0.921875,
        "special": false,
        "text": " type"
      },
      {
        "id": 315,
        "logprob": -0.00027918816,
        "special": false,
        "text": " of"
      },
      {
        "id": 21075,
        "logprob": -0.5527344,
        "special": false,
        "text": " artificial"
      },
      {
        "id": 11478,
        "logprob": -0.042541504,
        "special": false,
        "text": " intelligence"
      },
      {
        "id": 320,
        "logprob": -0.38891602,
        "special": false,
        "text": " ("
      },
      {
        "id": 15836,
        "logprob": -0.0011043549,
        "special": false,
        "text": "AI"
      }
    ],
    "top_tokens": null
  },
  "generated_text": " Deep learning is a type of artificial intelligence (AI"
 }
--- a/integration-tests/models/snapshots/test_compressed_tensors_w8an_fp/test_compressed_tensors_w8an_all_params.json
+++ b/integration-tests/models/snapshots/test_compressed_tensors_w8an_fp/test_compressed_tensors_w8an_all_params.json
@ -0,0 +1,99 @@
 {
  "details": {
    "best_of_sequences": null,
    "finish_reason": "length",
    "generated_tokens": 10,
    "prefill": [
      {
        "id": 128000,
        "logprob": null,
        "text": "<|begin_of_text|>"
      },
      {
        "id": 3923,
        "logprob": -7.609375,
        "text": "What"
      },
      {
        "id": 374,
        "logprob": -0.92529297,
        "text": " is"
      },
      {
        "id": 5655,
        "logprob": -10.0,
        "text": " deep"
      },
      {
        "id": 6975,
        "logprob": -0.94628906,
        "text": " learning"
      }
    ],
    "seed": 0,
    "tokens": [
      {
        "id": 5380,
        "logprob": -0.23840332,
        "special": false,
        "text": "?\n"
      },
      {
        "id": 34564,
        "logprob": 0.0,
        "special": false,
        "text": "Deep"
      },
      {
        "id": 6975,
        "logprob": 0.0,
        "special": false,
        "text": " learning"
      },
      {
        "id": 11,
        "logprob": 0.0,
        "special": false,
        "text": ","
      },
      {
        "id": 1101,
        "logprob": -1.2011719,
        "special": false,
        "text": " also"
      },
      {
        "id": 3967,
        "logprob": 0.0,
        "special": false,
        "text": " known"
      },
      {
        "id": 439,
        "logprob": 0.0,
        "special": false,
        "text": " as"
      },
      {
        "id": 30828,
        "logprob": 0.0,
        "special": false,
        "text": " neural"
      },
      {
        "id": 4009,
        "logprob": -0.6777344,
        "special": false,
        "text": " network"
      },
      {
        "id": 477,
        "logprob": 0.0,
        "special": false,
        "text": " or"
      }
    ],
    "top_tokens": null
  },
  "generated_text": "What is deep learning?\nDeep learning, also known as neural network or"
 }
--- a/integration-tests/models/snapshots/test_compressed_tensors_w8an_fp/test_compressed_tensors_w8an_load.json
+++ b/integration-tests/models/snapshots/test_compressed_tensors_w8an_fp/test_compressed_tensors_w8an_load.json
@ -0,0 +1,418 @@
 [
  {
    "details": {
      "best_of_sequences": null,
      "finish_reason": "length",
      "generated_tokens": 10,
      "prefill": [
        {
          "id": 128000,
          "logprob": null,
          "text": "<|begin_of_text|>"
        },
        {
          "id": 3923,
          "logprob": -7.609375,
          "text": "What"
        },
        {
          "id": 374,
          "logprob": -0.92529297,
          "text": " is"
        },
        {
          "id": 5655,
          "logprob": -10.0,
          "text": " deep"
        },
        {
          "id": 6975,
          "logprob": -0.94628906,
          "text": " learning"
        },
        {
          "id": 30,
          "logprob": -2.9042969,
          "text": "?"
        }
      ],
      "seed": null,
      "tokens": [
        {
          "id": 18682,
          "logprob": -0.8769531,
          "special": false,
          "text": " Deep"
        },
        {
          "id": 6975,
          "logprob": -0.0076942444,
          "special": false,
          "text": " learning"
        },
        {
          "id": 374,
          "logprob": -0.25146484,
          "special": false,
          "text": " is"
        },
        {
          "id": 264,
          "logprob": -0.097595215,
          "special": false,
          "text": " a"
        },
        {
          "id": 955,
          "logprob": -0.9248047,
          "special": false,
          "text": " type"
        },
        {
          "id": 315,
          "logprob": -0.00027513504,
          "special": false,
          "text": " of"
        },
        {
          "id": 21075,
          "logprob": -0.5527344,
          "special": false,
          "text": " artificial"
        },
        {
          "id": 11478,
          "logprob": -0.043151855,
          "special": false,
          "text": " intelligence"
        },
        {
          "id": 320,
          "logprob": -0.3840332,
          "special": false,
          "text": " ("
        },
        {
          "id": 15836,
          "logprob": -0.0011043549,
          "special": false,
          "text": "AI"
        }
      ],
      "top_tokens": null
    },
    "generated_text": " Deep learning is a type of artificial intelligence (AI"
  },
  {
    "details": {
      "best_of_sequences": null,
      "finish_reason": "length",
      "generated_tokens": 10,
      "prefill": [
        {
          "id": 128000,
          "logprob": null,
          "text": "<|begin_of_text|>"
        },
        {
          "id": 3923,
          "logprob": -7.6054688,
          "text": "What"
        },
        {
          "id": 374,
          "logprob": -0.92089844,
          "text": " is"
        },
        {
          "id": 5655,
          "logprob": -10.0,
          "text": " deep"
        },
        {
          "id": 6975,
          "logprob": -0.94433594,
          "text": " learning"
        },
        {
          "id": 30,
          "logprob": -2.90625,
          "text": "?"
        }
      ],
      "seed": null,
      "tokens": [
        {
          "id": 18682,
          "logprob": -0.875,
          "special": false,
          "text": " Deep"
        },
        {
          "id": 6975,
          "logprob": -0.007698059,
          "special": false,
          "text": " learning"
        },
        {
          "id": 374,
          "logprob": -0.25268555,
          "special": false,
          "text": " is"
        },
        {
          "id": 264,
          "logprob": -0.09753418,
          "special": false,
          "text": " a"
        },
        {
          "id": 955,
          "logprob": -0.92529297,
          "special": false,
          "text": " type"
        },
        {
          "id": 315,
          "logprob": -0.00027942657,
          "special": false,
          "text": " of"
        },
        {
          "id": 21075,
          "logprob": -0.5527344,
          "special": false,
          "text": " artificial"
        },
        {
          "id": 11478,
          "logprob": -0.042541504,
          "special": false,
          "text": " intelligence"
        },
        {
          "id": 320,
          "logprob": -0.3840332,
          "special": false,
          "text": " ("
        },
        {
          "id": 15836,
          "logprob": -0.0011053085,
          "special": false,
          "text": "AI"
        }
      ],
      "top_tokens": null
    },
    "generated_text": " Deep learning is a type of artificial intelligence (AI"
  },
  {
    "details": {
      "best_of_sequences": null,
      "finish_reason": "length",
      "generated_tokens": 10,
      "prefill": [
        {
          "id": 128000,
          "logprob": null,
          "text": "<|begin_of_text|>"
        },
        {
          "id": 3923,
          "logprob": -7.6054688,
          "text": "What"
        },
        {
          "id": 374,
          "logprob": -0.92089844,
          "text": " is"
        },
        {
          "id": 5655,
          "logprob": -10.0,
          "text": " deep"
        },
        {
          "id": 6975,
          "logprob": -0.94433594,
          "text": " learning"
        },
        {
          "id": 30,
          "logprob": -2.90625,
          "text": "?"
        }
      ],
      "seed": null,
      "tokens": [
        {
          "id": 18682,
          "logprob": -0.875,
          "special": false,
          "text": " Deep"
        },
        {
          "id": 6975,
          "logprob": -0.007698059,
          "special": false,
          "text": " learning"
        },
        {
          "id": 374,
          "logprob": -0.25268555,
          "special": false,
          "text": " is"
        },
        {
          "id": 264,
          "logprob": -0.09753418,
          "special": false,
          "text": " a"
        },
        {
          "id": 955,
          "logprob": -0.92529297,
          "special": false,
          "text": " type"
        },
        {
          "id": 315,
          "logprob": -0.00027942657,
          "special": false,
          "text": " of"
        },
        {
          "id": 21075,
          "logprob": -0.5527344,
          "special": false,
          "text": " artificial"
        },
        {
          "id": 11478,
          "logprob": -0.042541504,
          "special": false,
          "text": " intelligence"
        },
        {
          "id": 320,
          "logprob": -0.3840332,
          "special": false,
          "text": " ("
        },
        {
          "id": 15836,
          "logprob": -0.0011053085,
          "special": false,
          "text": "AI"
        }
      ],
      "top_tokens": null
    },
    "generated_text": " Deep learning is a type of artificial intelligence (AI"
  },
  {
    "details": {
      "best_of_sequences": null,
      "finish_reason": "length",
      "generated_tokens": 10,
      "prefill": [
        {
          "id": 128000,
          "logprob": null,
          "text": "<|begin_of_text|>"
        },
        {
          "id": 3923,
          "logprob": -7.6054688,
          "text": "What"
        },
        {
          "id": 374,
          "logprob": -0.92089844,
          "text": " is"
        },
        {
          "id": 5655,
          "logprob": -10.0,
          "text": " deep"
        },
        {
          "id": 6975,
          "logprob": -0.94433594,
          "text": " learning"
        },
        {
          "id": 30,
          "logprob": -2.90625,
          "text": "?"
        }
      ],
      "seed": null,
      "tokens": [
        {
          "id": 18682,
          "logprob": -0.875,
          "special": false,
          "text": " Deep"
        },
        {
          "id": 6975,
          "logprob": -0.007698059,
          "special": false,
          "text": " learning"
        },
        {
          "id": 374,
          "logprob": -0.25268555,
          "special": false,
          "text": " is"
        },
        {
          "id": 264,
          "logprob": -0.09753418,
          "special": false,
          "text": " a"
        },
        {
          "id": 955,
          "logprob": -0.92529297,
          "special": false,
          "text": " type"
        },
        {
          "id": 315,
          "logprob": -0.00027942657,
          "special": false,
          "text": " of"
        },
        {
          "id": 21075,
          "logprob": -0.5527344,
          "special": false,
          "text": " artificial"
        },
        {
          "id": 11478,
          "logprob": -0.042541504,
          "special": false,
          "text": " intelligence"
        },
        {
          "id": 320,
          "logprob": -0.3840332,
          "special": false,
          "text": " ("
        },
        {
          "id": 15836,
          "logprob": -0.0011053085,
          "special": false,
          "text": "AI"
        }
      ],
      "top_tokens": null
    },
    "generated_text": " Deep learning is a type of artificial intelligence (AI"
  }
 ]
--- a/integration-tests/models/snapshots/test_compressed_tensors_wna16_int/test_compressed_tensors_wna16.json
+++ b/integration-tests/models/snapshots/test_compressed_tensors_wna16_int/test_compressed_tensors_wna16.json
@ -0,0 +1,104 @@
 {
  "details": {
    "best_of_sequences": null,
    "finish_reason": "length",
    "generated_tokens": 10,
    "prefill": [
      {
        "id": 2,
        "logprob": null,
        "text": "<bos>"
      },
      {
        "id": 1841,
        "logprob": -5.46875,
        "text": "What"
      },
      {
        "id": 603,
        "logprob": -0.69140625,
        "text": " is"
      },
      {
        "id": 5271,
        "logprob": -12.0,
        "text": " deep"
      },
      {
        "id": 6044,
        "logprob": -0.32226562,
        "text": " learning"
      },
      {
        "id": 235336,
        "logprob": -0.33203125,
        "text": "?"
      }
    ],
    "seed": null,
    "tokens": [
      {
        "id": 109,
        "logprob": -0.24707031,
        "special": false,
        "text": "\n\n"
      },
      {
        "id": 26843,
        "logprob": -0.14550781,
        "special": false,
        "text": "Deep"
      },
      {
        "id": 6044,
        "logprob": -0.038330078,
        "special": false,
        "text": " learning"
      },
      {
        "id": 603,
        "logprob": -0.029907227,
        "special": false,
        "text": " is"
      },
      {
        "id": 476,
        "logprob": -0.020996094,
        "special": false,
        "text": " a"
      },
      {
        "id": 38397,
        "logprob": -0.828125,
        "special": false,
        "text": " subset"
      },
      {
        "id": 576,
        "logprob": -0.00049209595,
        "special": false,
        "text": " of"
      },
      {
        "id": 6479,
        "logprob": -0.057373047,
        "special": false,
        "text": " machine"
      },
      {
        "id": 6044,
        "logprob": -0.000207901,
        "special": false,
        "text": " learning"
      },
      {
        "id": 674,
        "logprob": -0.15429688,
        "special": false,
        "text": " that"
      }
    ],
    "top_tokens": null
  },
  "generated_text": "\n\nDeep learning is a subset of machine learning that"
 }
--- a/integration-tests/models/snapshots/test_compressed_tensors_wna16_int/test_compressed_tensors_wna16_all_params.json
+++ b/integration-tests/models/snapshots/test_compressed_tensors_wna16_int/test_compressed_tensors_wna16_all_params.json
@ -0,0 +1,99 @@
 {
  "details": {
    "best_of_sequences": null,
    "finish_reason": "length",
    "generated_tokens": 10,
    "prefill": [
      {
        "id": 2,
        "logprob": null,
        "text": "<bos>"
      },
      {
        "id": 1841,
        "logprob": -5.46875,
        "text": "What"
      },
      {
        "id": 603,
        "logprob": -0.69140625,
        "text": " is"
      },
      {
        "id": 5271,
        "logprob": -12.0,
        "text": " deep"
      },
      {
        "id": 6044,
        "logprob": -0.32226562,
        "text": " learning"
      }
    ],
    "seed": 0,
    "tokens": [
      {
        "id": 235336,
        "logprob": 0.0,
        "special": false,
        "text": "?"
      },
      {
        "id": 109,
        "logprob": 0.0,
        "special": false,
        "text": "\n\n"
      },
      {
        "id": 26843,
        "logprob": 0.0,
        "special": false,
        "text": "Deep"
      },
      {
        "id": 14715,
        "logprob": -0.38671875,
        "special": false,
        "text": " Learning"
      },
      {
        "id": 603,
        "logprob": 0.0,
        "special": false,
        "text": " is"
      },
      {
        "id": 476,
        "logprob": 0.0,
        "special": false,
        "text": " a"
      },
      {
        "id": 38397,
        "logprob": -0.12695312,
        "special": false,
        "text": " subset"
      },
      {
        "id": 576,
        "logprob": 0.0,
        "special": false,
        "text": " of"
      },
      {
        "id": 6479,
        "logprob": 0.0,
        "special": false,
        "text": " machine"
      },
      {
        "id": 6044,
        "logprob": 0.0,
        "special": false,
        "text": " learning"
      }
    ],
    "top_tokens": null
  },
  "generated_text": "What is deep learning?\n\nDeep Learning is a subset of machine learning"
 }
--- a/integration-tests/models/snapshots/test_compressed_tensors_wna16_int/test_compressed_tensors_wna16_load.json
+++ b/integration-tests/models/snapshots/test_compressed_tensors_wna16_int/test_compressed_tensors_wna16_load.json
@ -0,0 +1,418 @@
 [
  {
    "details": {
      "best_of_sequences": null,
      "finish_reason": "length",
      "generated_tokens": 10,
      "prefill": [
        {
          "id": 2,
          "logprob": null,
          "text": "<bos>"
        },
        {
          "id": 1841,
          "logprob": -5.46875,
          "text": "What"
        },
        {
          "id": 603,
          "logprob": -0.69140625,
          "text": " is"
        },
        {
          "id": 5271,
          "logprob": -12.0,
          "text": " deep"
        },
        {
          "id": 6044,
          "logprob": -0.32226562,
          "text": " learning"
        },
        {
          "id": 235336,
          "logprob": -0.33203125,
          "text": "?"
        }
      ],
      "seed": null,
      "tokens": [
        {
          "id": 109,
          "logprob": -0.24707031,
          "special": false,
          "text": "\n\n"
        },
        {
          "id": 26843,
          "logprob": -0.14550781,
          "special": false,
          "text": "Deep"
        },
        {
          "id": 6044,
          "logprob": -0.03857422,
          "special": false,
          "text": " learning"
        },
        {
          "id": 603,
          "logprob": -0.030883789,
          "special": false,
          "text": " is"
        },
        {
          "id": 476,
          "logprob": -0.020996094,
          "special": false,
          "text": " a"
        },
        {
          "id": 38397,
          "logprob": -0.828125,
          "special": false,
          "text": " subset"
        },
        {
          "id": 576,
          "logprob": -0.00051498413,
          "special": false,
          "text": " of"
        },
        {
          "id": 6479,
          "logprob": -0.05883789,
          "special": false,
          "text": " machine"
        },
        {
          "id": 6044,
          "logprob": -0.00020694733,
          "special": false,
          "text": " learning"
        },
        {
          "id": 674,
          "logprob": -0.15820312,
          "special": false,
          "text": " that"
        }
      ],
      "top_tokens": null
    },
    "generated_text": "\n\nDeep learning is a subset of machine learning that"
  },
  {
    "details": {
      "best_of_sequences": null,
      "finish_reason": "length",
      "generated_tokens": 10,
      "prefill": [
        {
          "id": 2,
          "logprob": null,
          "text": "<bos>"
        },
        {
          "id": 1841,
          "logprob": -5.46875,
          "text": "What"
        },
        {
          "id": 603,
          "logprob": -0.71484375,
          "text": " is"
        },
        {
          "id": 5271,
          "logprob": -12.0,
          "text": " deep"
        },
        {
          "id": 6044,
          "logprob": -0.30859375,
          "text": " learning"
        },
        {
          "id": 235336,
          "logprob": -0.3359375,
          "text": "?"
        }
      ],
      "seed": null,
      "tokens": [
        {
          "id": 109,
          "logprob": -0.23828125,
          "special": false,
          "text": "\n\n"
        },
        {
          "id": 26843,
          "logprob": -0.14550781,
          "special": false,
          "text": "Deep"
        },
        {
          "id": 6044,
          "logprob": -0.038330078,
          "special": false,
          "text": " learning"
        },
        {
          "id": 603,
          "logprob": -0.030883789,
          "special": false,
          "text": " is"
        },
        {
          "id": 476,
          "logprob": -0.020996094,
          "special": false,
          "text": " a"
        },
        {
          "id": 38397,
          "logprob": -0.80859375,
          "special": false,
          "text": " subset"
        },
        {
          "id": 576,
          "logprob": -0.0005455017,
          "special": false,
          "text": " of"
        },
        {
          "id": 6479,
          "logprob": -0.05908203,
          "special": false,
          "text": " machine"
        },
        {
          "id": 6044,
          "logprob": -0.00020599365,
          "special": false,
          "text": " learning"
        },
        {
          "id": 674,
          "logprob": -0.17285156,
          "special": false,
          "text": " that"
        }
      ],
      "top_tokens": null
    },
    "generated_text": "\n\nDeep learning is a subset of machine learning that"
  },
  {
    "details": {
      "best_of_sequences": null,
      "finish_reason": "length",
      "generated_tokens": 10,
      "prefill": [
        {
          "id": 2,
          "logprob": null,
          "text": "<bos>"
        },
        {
          "id": 1841,
          "logprob": -5.46875,
          "text": "What"
        },
        {
          "id": 603,
          "logprob": -0.71484375,
          "text": " is"
        },
        {
          "id": 5271,
          "logprob": -12.0,
          "text": " deep"
        },
        {
          "id": 6044,
          "logprob": -0.30859375,
          "text": " learning"
        },
        {
          "id": 235336,
          "logprob": -0.3359375,
          "text": "?"
        }
      ],
      "seed": null,
      "tokens": [
        {
          "id": 109,
          "logprob": -0.23828125,
          "special": false,
          "text": "\n\n"
        },
        {
          "id": 26843,
          "logprob": -0.14550781,
          "special": false,
          "text": "Deep"
        },
        {
          "id": 6044,
          "logprob": -0.038330078,
          "special": false,
          "text": " learning"
        },
        {
          "id": 603,
          "logprob": -0.030883789,
          "special": false,
          "text": " is"
        },
        {
          "id": 476,
          "logprob": -0.020996094,
          "special": false,
          "text": " a"
        },
        {
          "id": 38397,
          "logprob": -0.80859375,
          "special": false,
          "text": " subset"
        },
        {
          "id": 576,
          "logprob": -0.0005455017,
          "special": false,
          "text": " of"
        },
        {
          "id": 6479,
          "logprob": -0.05908203,
          "special": false,
          "text": " machine"
        },
        {
          "id": 6044,
          "logprob": -0.00020599365,
          "special": false,
          "text": " learning"
        },
        {
          "id": 674,
          "logprob": -0.17285156,
          "special": false,
          "text": " that"
        }
      ],
      "top_tokens": null
    },
    "generated_text": "\n\nDeep learning is a subset of machine learning that"
  },
  {
    "details": {
      "best_of_sequences": null,
      "finish_reason": "length",
      "generated_tokens": 10,
      "prefill": [
        {
          "id": 2,
          "logprob": null,
          "text": "<bos>"
        },
        {
          "id": 1841,
          "logprob": -5.46875,
          "text": "What"
        },
        {
          "id": 603,
          "logprob": -0.71484375,
          "text": " is"
        },
        {
          "id": 5271,
          "logprob": -12.0,
          "text": " deep"
        },
        {
          "id": 6044,
          "logprob": -0.30859375,
          "text": " learning"
        },
        {
          "id": 235336,
          "logprob": -0.3359375,
          "text": "?"
        }
      ],
      "seed": null,
      "tokens": [
        {
          "id": 109,
          "logprob": -0.23828125,
          "special": false,
          "text": "\n\n"
        },
        {
          "id": 26843,
          "logprob": -0.14550781,
          "special": false,
          "text": "Deep"
        },
        {
          "id": 6044,
          "logprob": -0.038330078,
          "special": false,
          "text": " learning"
        },
        {
          "id": 603,
          "logprob": -0.030883789,
          "special": false,
          "text": " is"
        },
        {
          "id": 476,
          "logprob": -0.020996094,
          "special": false,
          "text": " a"
        },
        {
          "id": 38397,
          "logprob": -0.80859375,
          "special": false,
          "text": " subset"
        },
        {
          "id": 576,
          "logprob": -0.0005455017,
          "special": false,
          "text": " of"
        },
        {
          "id": 6479,
          "logprob": -0.05908203,
          "special": false,
          "text": " machine"
        },
        {
          "id": 6044,
          "logprob": -0.00020599365,
          "special": false,
          "text": " learning"
        },
        {
          "id": 674,
          "logprob": -0.17285156,
          "special": false,
          "text": " that"
        }
      ],
      "top_tokens": null
    },
    "generated_text": "\n\nDeep learning is a subset of machine learning that"
  }
 ]
--- a/integration-tests/models/test_compressed_tensors_w8an_fp.py
+++ b/integration-tests/models/test_compressed_tensors_w8an_fp.py
@ -0,0 +1,86 @@
 import pytest
@pytest.fixture(scope="module")
def compressed_tensors_w8an_handle(launcher):
    """Launch the FP8 compressed-tensors checkpoint and yield its handle.

    The model is started through the `launcher` fixture with two shards
    and `quantize="compressed-tensors"`. The launcher context is left once
    the module-scoped fixture is finalized.
    """
    model_id = "neuralmagic/Llama-3.2-1B-Instruct-FP8"
    with launcher(model_id, num_shard=2, quantize="compressed-tensors") as server:
        yield server
@pytest.fixture(scope="module")
async def compressed_tensors_w8an(compressed_tensors_w8an_handle):
    """Return the client of the launched model after its health check passes."""
    server = compressed_tensors_w8an_handle
    # NOTE(review): 300 is presumably a timeout in seconds -- confirm
    # against the launcher handle's `health` implementation.
    await server.health(300)
    return server.client
@pytest.mark.release
@pytest.mark.asyncio
@pytest.mark.private
async def test_compressed_tensors_w8an(compressed_tensors_w8an, response_snapshot):
    """Generate 10 tokens with default parameters and compare to the snapshot."""
    expected_text = " Deep learning is a type of artificial intelligence (AI"

    response = await compressed_tensors_w8an.generate(
        "What is deep learning?", max_new_tokens=10, decoder_input_details=True
    )

    assert response.generated_text == expected_text
    assert response.details.generated_tokens == 10
    assert response == response_snapshot
@pytest.mark.release
@pytest.mark.asyncio
@pytest.mark.private
async def test_compressed_tensors_w8an_all_params(
    compressed_tensors_w8an, response_snapshot
):
    """Generate with every sampling and stopping parameter set, using a fixed seed.

    Marked `release` and `private` like the other tests in this module:
    all of them depend on the same module-scoped launcher fixture, so
    without the marks this test alone would start the model in runs that
    deselect the others.
    """
    response = await compressed_tensors_w8an.generate(
        "What is deep learning",
        max_new_tokens=10,
        repetition_penalty=1.2,
        return_full_text=True,
        stop_sequences=["test"],
        temperature=0.5,
        top_p=0.9,
        top_k=10,
        truncate=5,
        typical_p=0.9,
        watermark=True,
        decoder_input_details=True,
        seed=0,
    )

    assert response.details.generated_tokens == 10
    assert (
        response.generated_text
        == "What is deep learning?\nDeep learning, also known as neural network or"
    )
    assert response == response_snapshot
@pytest.mark.release
@pytest.mark.asyncio
@pytest.mark.private
async def test_compressed_tensors_w8an_load(
    compressed_tensors_w8an, generate_load, response_snapshot
):
    """Send the same prompt four times and require identical generations."""
    expected_text = " Deep learning is a type of artificial intelligence (AI"

    responses = await generate_load(
        compressed_tensors_w8an, "What is deep learning?", max_new_tokens=10, n=4
    )

    first_text = responses[0].generated_text
    assert first_text == expected_text
    assert len(responses) == 4
    assert all(r.generated_text == first_text for r in responses)
    assert responses == response_snapshot
--- a/integration-tests/models/test_compressed_tensors_wna16_int.py
+++ b/integration-tests/models/test_compressed_tensors_wna16_int.py
@ -0,0 +1,86 @@
 import pytest
@pytest.fixture(scope="module")
def compressed_tensors_wna16_handle(launcher):
    """Launch the W4A16 compressed-tensors checkpoint and yield its handle.

    The model is started through the `launcher` fixture with two shards
    and `quantize="compressed-tensors"`. The launcher context is left once
    the module-scoped fixture is finalized.
    """
    model_id = "neuralmagic/gemma-2-2b-it-quantized.w4a16"
    with launcher(model_id, num_shard=2, quantize="compressed-tensors") as server:
        yield server
@pytest.fixture(scope="module")
async def compressed_tensors_wna16(compressed_tensors_wna16_handle):
    """Return the client of the launched model after its health check passes."""
    server = compressed_tensors_wna16_handle
    # NOTE(review): 300 is presumably a timeout in seconds -- confirm
    # against the launcher handle's `health` implementation.
    await server.health(300)
    return server.client
@pytest.mark.release
@pytest.mark.asyncio
@pytest.mark.private
async def test_compressed_tensors_wna16(compressed_tensors_wna16, response_snapshot):
    """Generate 10 tokens with default parameters and compare to the snapshot."""
    expected_text = "\n\nDeep learning is a subset of machine learning that"

    response = await compressed_tensors_wna16.generate(
        "What is deep learning?", max_new_tokens=10, decoder_input_details=True
    )

    assert response.generated_text == expected_text
    assert response.details.generated_tokens == 10
    assert response == response_snapshot
@pytest.mark.release
@pytest.mark.asyncio
@pytest.mark.private
async def test_compressed_tensors_wna16_all_params(
    compressed_tensors_wna16, response_snapshot
):
    """Generate with every sampling and stopping parameter set, using a fixed seed.

    Marked `release` and `private` like the other tests in this module:
    all of them depend on the same module-scoped launcher fixture, so
    without the marks this test alone would start the model in runs that
    deselect the others.
    """
    response = await compressed_tensors_wna16.generate(
        "What is deep learning",
        max_new_tokens=10,
        repetition_penalty=1.2,
        return_full_text=True,
        stop_sequences=["test"],
        temperature=0.5,
        top_p=0.9,
        top_k=10,
        truncate=5,
        typical_p=0.9,
        watermark=True,
        decoder_input_details=True,
        seed=0,
    )

    assert response.details.generated_tokens == 10
    assert (
        response.generated_text
        == "What is deep learning?\n\nDeep Learning is a subset of machine learning"
    )
    assert response == response_snapshot
@pytest.mark.release
@pytest.mark.asyncio
@pytest.mark.private
async def test_compressed_tensors_wna16_load(
    compressed_tensors_wna16, generate_load, response_snapshot
):
    """Send the same prompt four times and require identical generations."""
    expected_text = "\n\nDeep learning is a subset of machine learning that"

    responses = await generate_load(
        compressed_tensors_wna16, "What is deep learning?", max_new_tokens=10, n=4
    )

    first_text = responses[0].generated_text
    assert first_text == expected_text
    assert len(responses) == 4
    assert all(r.generated_text == first_text for r in responses)
    assert responses == response_snapshot
--- a/launcher/src/main.rs
+++ b/launcher/src/main.rs
@ -212,6 +212,8 @@ enum Quantization {
    ///   <https://hf.co/models?search=awq>.
    /// Should replace GPTQ models wherever possible because of the better latency
    Awq,
    /// Compressed tensors, which can be a mixture of different quantization methods.
    CompressedTensors,
    /// 8 bit quantization, doesn't require specific model.
    /// Should be a drop-in replacement to bitsandbytes with much better performance.
    /// Kernels are from <https://github.com/NetEase-FuXi/EETQ.git>
@ -274,6 +276,9 @@ impl std::fmt::Display for Quantization {
            Quantization::Awq => {
                write!(f, "awq")
            }
            Quantization::CompressedTensors => {
                write!(f, "compressed-tensors")
            }
            Quantization::Eetq => {
                write!(f, "eetq")
            }
--- a/nix/server.nix
+++ b/nix/server.nix
@ -5,6 +5,7 @@
  mypy-protobuf,
  awq-inference-engine,
  causal-conv1d,
  compressed-tensors,
  eetq,
  einops,
  exllamav2,
@ -74,6 +75,7 @@ buildPythonPackage {
    awq-inference-engine
    eetq
    causal-conv1d
    compressed-tensors
    einops
    exllamav2
    flashinfer
--- a/server/Makefile
+++ b/server/Makefile
@ -23,7 +23,7 @@ gen-server:
 install-server: gen-server
 	pip install pip --upgrade
 	pip install -r requirements_cuda.txt
-	pip install -e ".[accelerate, quantize, peft, outlines]"
+	pip install -e ".[accelerate, compressed-tensors, quantize, peft, outlines]"
 install: install-cuda
--- a/server/poetry.lock
+++ b/server/poetry.lock
@ -1,4 +1,4 @@
-# This file is automatically @generated by Poetry 1.8.3 and should not be changed by hand.
+# This file is automatically @generated by Poetry 1.8.4 and should not be changed by hand.
 [[package]]
 name = "accelerate"
@ -388,6 +388,26 @@ files = [
    {file = "colorama-0.4.6.tar.gz", hash = "sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44"},
 ]
 [[package]]
 name = "compressed-tensors"
 version = "0.7.1"
 description = "Library for utilization of compressed safetensors of neural network models"
 optional = true
 python-versions = "*"
 files = [
    {file = "compressed-tensors-0.7.1.tar.gz", hash = "sha256:3c7865ebfe4ea76ae94d7c674bcf93aedd2064571f682c09a377a219d5ebb3a0"},
    {file = "compressed_tensors-0.7.1-py3-none-any.whl", hash = "sha256:22d11558a70f655ae647db9c8e9fb14a5e9d6983ca5aec3f267518625fd6dd0e"},
 ]
 [package.dependencies]
 pydantic = ">=2.0"
 torch = ">=1.7.0"
 transformers = "*"
 [package.extras]
 accelerate = ["accelerate"]
 dev = ["black (==22.12.0)", "flake8 (>=3.8.3)", "isort (==5.8.0)", "nbconvert (>=7.16.3)", "pytest (>=6.0.0)", "wheel (>=0.36.2)"]
 [[package]]
 name = "datasets"
 version = "2.21.0"
@ -3982,4 +4002,4 @@ torch = ["torch"]
 [metadata]
 lock-version = "2.0"
 python-versions = ">=3.9,<3.13"
-content-hash = "b39033e573f50a0f046787aebf1702d86673aad0b2fcee818404fcea7f644b81"
+content-hash = "4636689efd4c94559c3c23903aafcffd177533a3b9006b3b4f8491b158a3a754"
--- a/server/pyproject.toml
+++ b/server/pyproject.toml
@ -37,6 +37,7 @@ pillow = "^10.0.0"
 outlines= { version = "^0.0.34", optional = true }
 prometheus-client = "^0.20.0"
 py-cpuinfo = "^9.0.0"
 compressed-tensors = { version = "^0.7.1", optional = true }
 # Remove later, temporary workaround for outlines.
 numpy = "^1.26"
@ -58,6 +59,7 @@ rich = "^13.7.1"
 torch = ["torch"]
 accelerate = ["accelerate"]
 bnb = ["bitsandbytes"]
 compressed-tensors = ["compressed-tensors"]
 marlin = ["marlin-kernels"]
 moe = ["moe-kernels"]
 peft = ["peft"]
--- a/server/text_generation_server/cli.py
+++ b/server/text_generation_server/cli.py
@ -19,6 +19,7 @@ class Quantization(str, Enum):
    bitsandbytes_fp4 = "bitsandbytes-fp4"
    gptq = "gptq"
    awq = "awq"
    compressed_tensors = "compressed-tensors"
    eetq = "eetq"
    exl2 = "exl2"
    fp8 = "fp8"
--- a/server/text_generation_server/layers/compressed_tensors/init.py
+++ b/server/text_generation_server/layers/compressed_tensors/init.py
@ -0,0 +1,3 @@
 from .loader import CompressedTensorsLoader
 __all__ = ["CompressedTensorsLoader"]
--- a/server/text_generation_server/layers/compressed_tensors/loader.py
+++ b/server/text_generation_server/layers/compressed_tensors/loader.py
@ -0,0 +1,174 @@
 from typing import Any, Dict, List, Union
 from compressed_tensors import QuantizationConfig, QuantizationStatus
 from compressed_tensors.config import CompressionFormat
 from compressed_tensors.quantization import (
    QuantizationScheme,
    QuantizationType,
    find_name_or_class_matches,
 )
 from loguru import logger
 from pydantic import ValidationError
 from torch import nn
 from text_generation_server.layers.compressed_tensors.w8an_fp import W8ANFpLoader
 from text_generation_server.layers.compressed_tensors.wna16_int import WNA16Loader
 from text_generation_server.utils.log import log_once
 from text_generation_server.utils.weights import (
    DefaultWeightsLoader,
    UnquantizedWeight,
    Weights,
    WeightsLoader,
 )
 # compressed-tensors can match modules as quantization targets. However,
 # the matcher needs a module object rather than a class or class name.
 # Since we only need to match `Linear` targets, create one (empty)
 # instance here that is re-used for every lookup.
 _EMPTY_LINEAR: nn.Module = nn.Linear(0, 0)
 class CompressedTensorsLoader(WeightsLoader):
    """
    Loader for checkpoints stored in the compressed-tensors format.

    The quantization configuration is parsed once at construction into a
    mapping from quantization target to a concrete `WeightsLoader`. Every
    `get_*` method resolves the loader for the requested prefix and
    delegates the actual loading to it.
    """

    def __init__(self, config: Dict[str, Any]):
        """
        Parse the compressed-tensors section of a model configuration.

        Args:
            config: Model configuration dictionary. The quantization
                settings are read from `quantization_config`, falling
                back to the legacy `compression_config` key.

        Raises:
            ValueError: If the configuration is missing, cannot be parsed,
                describes an unfinished quantization, or contains a group
                that is not supported.
        """
        quantization_config_raw = config.get("quantization_config")
        if quantization_config_raw is None:
            # `compression_config` was renamed to `quantization_config`; support
            # retained for backward compatibility.
            quantization_config_raw = config.get("compression_config")
        if quantization_config_raw is None:
            raise ValueError(
                "Checkpoint does not have compressed-tensors configuration"
            )

        try:
            quantization_config = QuantizationConfig.model_validate(
                quantization_config_raw
            )
        except ValidationError as e:
            raise ValueError("Cannot parse compressed-tensors configuration") from e

        if quantization_config.quantization_status not in (
            QuantizationStatus.COMPRESSED,
            QuantizationStatus.FROZEN,
        ):
            raise ValueError(
                f"Model quantization was not finished, status was: {quantization_config.quantization_status}"
            )

        # Targets/prefixes listed here are loaded unquantized.
        self.ignore = (
            quantization_config.ignore if quantization_config.ignore is not None else []
        )
        self.loaders = self._get_target_loaders(quantization_config)

        for target, loader in self.loaders.items():
            log_once(
                logger.info,
                f"Using {loader} for compressed-tensors target '{target}'",
            )

    def get_weights(self, weights: Weights, prefix: str):
        """Load the weights for `prefix` with the loader matching that prefix."""
        loader = self._lookup_loader(prefix)
        return loader.get_weights(weights, prefix)

    def get_weights_col_packed(
        self,
        weights: "Weights",
        prefix: str,
        block_sizes: Union[int, List[int]],
    ):
        """Load column-packed weights for `prefix` with its matching loader."""
        loader = self._lookup_loader(prefix)
        return loader.get_weights_col_packed(weights, prefix, block_sizes)

    def get_multi_weights_col(self, weights: Weights, prefixes: List[str], dim: int):
        """
        Load and concatenate the column weights of several prefixes.

        The loader is resolved from the first prefix only and is then used
        for all of `prefixes`.
        """
        loader = self._lookup_loader(prefixes[0])
        return loader.get_multi_weights_col(weights, prefixes, dim)

    def get_weights_row(self, weights: Weights, prefix: str):
        """Load row-sharded weights for `prefix` with its matching loader."""
        loader = self._lookup_loader(prefix)
        return loader.get_weights_row(weights, prefix)

    def _get_target_loaders(
        self, quantization_config: QuantizationConfig
    ) -> Dict[str, WeightsLoader]:
        """
        A compressed-tensors checkpoint can use different quantizations
        for different targets. This method returns a dictionary with a
        loader per target.

        Raises:
            ValueError: If a group is not a parsed `QuantizationScheme`,
                is unsupported, or if a target is claimed by more than
                one group.
        """
        loaders: Dict[str, WeightsLoader] = {}

        compression_format = quantization_config.format

        for group_name, group in quantization_config.config_groups.items():
            # The group configuration can be a string, but does that ever
            # happen in a serialized quantization config? Raise explicitly
            # rather than assert, so the check also holds under `python -O`.
            if not isinstance(group, QuantizationScheme):
                raise ValueError(
                    f"Group '{group_name}' is not a quantization scheme: {group!r}"
                )

            loader = self._create_loader_for_group(
                compression_format, group_name, group
            )

            # A quantized parameter group can have multiple targets, add the
            # loader for all the targets.
            for target in group.targets:
                if target in loaders:
                    raise ValueError(
                        f"Target '{target}' has multiple configured loaders"
                    )
                loaders[target] = loader

        return loaders

    def _create_loader_for_group(
        self, format: str, group_name: str, group: QuantizationScheme
    ) -> WeightsLoader:
        """
        Find and create a loader for the group with the given quantization
        scheme.

        Args:
            format: Compression format string of the checkpoint.
            group_name: Name of the configuration group, used in errors.
            group: Quantization scheme of the group.

        Raises:
            ValueError: If no loader supports the format/scheme combination.
        """
        # NOTE: we ignore group.output_activations because we don't support
        #       output quantization yet.
        input_activations = group.input_activations
        weights = group.weights
        if (
            format
            in {
                CompressionFormat.float_quantized.value,
                CompressionFormat.naive_quantized.value,
            }
            and weights is not None
            and weights.type == QuantizationType.FLOAT
            and weights.num_bits == 8
        ):
            # FP W8A8 or W8A16.
            return W8ANFpLoader(input_activations=input_activations, weights=weights)
        elif (
            format == CompressionFormat.pack_quantized.value
            and weights is not None
            and weights.type == QuantizationType.INT
            and weights.num_bits in (4, 8)
        ):
            # INT W4A16 or W8A16 (GPTQ/AWQ-like).
            return WNA16Loader(weights)
        else:
            raise ValueError(
                f"Group '{group_name}' has unsupported compressed-tensors configuration"
            )

    def _lookup_loader(self, prefix: str) -> WeightsLoader:
        """
        Look up the loader to use for a given parameter name (prefix).

        Prefixes matching the ignore list get an unquantized default
        loader; all others must match a configured target.

        Raises:
            ValueError: If the prefix matches no configured target.
        """
        if len(find_name_or_class_matches(prefix, _EMPTY_LINEAR, self.ignore)) > 0:
            return DefaultWeightsLoader(UnquantizedWeight)

        # We currently only handle linear layers, so unconditionally pass
        # a `Linear` instance.
        targets = find_name_or_class_matches(prefix, _EMPTY_LINEAR, self.loaders.keys())
        if len(targets) == 0:
            raise ValueError(
                f"Cannot find compressed-tensors target for prefix: {prefix}"
            )
        # When several targets match, the first reported match is used.
        return self.loaders[targets[0]]
--- a/server/text_generation_server/layers/compressed_tensors/w8an_fp.py
+++ b/server/text_generation_server/layers/compressed_tensors/w8an_fp.py
@ -0,0 +1,174 @@
 from typing import List, Optional, Union
 import torch
 from compressed_tensors.quantization import QuantizationArgs, QuantizationType
 from text_generation_server.layers.fp8 import Fp8Weight, _load_scalar_or_matrix_scale
 from text_generation_server.utils.weights import Weights, WeightsLoader
 class W8ANFpLoader(WeightsLoader):
    """
    Weights loader for FP8 (W8A8/W8A16) compressed-tensors parameters.
    """

    def __init__(
        self,
        *,
        input_activations: Optional[QuantizationArgs],
        weights: QuantizationArgs,
    ):
        assert weights.type == QuantizationType.FLOAT and weights.num_bits == 8

        # The `strategy` option (per-tensor, per-channel or per-token
        # scales) is deliberately ignored. Which scales are usable depends
        # on the kernels (e.g. cutlass can do tokenwise, Torch cannot, and
        # FP8-Marlin does not quantize inputs at all), so we try to use the
        # best-possible configuration instead.
        has_input_args = input_activations is not None

        # Static (non-dynamic) scales are stored in the checkpoint and
        # therefore have to be loaded.
        self.load_weight_scale = not weights.dynamic
        self.load_input_scale = has_input_args and not input_activations.dynamic
        self.force_w8a16 = has_input_args and input_activations.num_bits == 16

    def __str__(self) -> str:
        activation_bits = 16 if self.force_w8a16 else 8
        weight_mode = "static" if self.load_weight_scale else "dynamic"
        input_mode = "static" if self.load_input_scale else "dynamic"
        return (
            f"{self.__class__.__name__} (W8A{activation_bits}, "
            f"weight: {weight_mode}, input: {input_mode})"
        )

    def _load_unsharded_scales(self, weights: "Weights", prefix: str, out_features):
        """
        Load the full (unsharded) weight and input scales for `prefix`.

        The weight scale is flattened and expanded to `out_features`
        entries; the input scale is only flattened. A scale that is not
        configured to be loaded is returned as `None`.
        """
        weight_scale = None
        if self.load_weight_scale:
            raw_scale = weights.get_tensor(f"{prefix}.weight_scale", to_dtype=False)
            weight_scale = raw_scale.reshape(-1).expand(out_features)

        input_scale = None
        if self.load_input_scale:
            raw_scale = weights.get_tensor(f"{prefix}.input_scale", to_dtype=False)
            input_scale = raw_scale.reshape(-1)

        return weight_scale, input_scale

    def _as_fp8_weight(self, weights: "Weights", w, weight_scale, input_scale):
        """Wrap the loaded tensors in an `Fp8Weight` using this loader's settings."""
        return Fp8Weight(
            weight=w,
            weight_scale=weight_scale,
            input_scale=input_scale,
            dtype=weights.dtype,
            force_w8a16=self.force_w8a16,
        )

    def get_weights(self, weights: "Weights", prefix: str):
        """Load the complete weight tensor of `prefix` together with its scales."""
        w = weights.get_tensor(f"{prefix}.weight")
        weight_scale, input_scale = self._load_unsharded_scales(
            weights, prefix, w.shape[0]
        )
        return self._as_fp8_weight(weights, w, weight_scale, input_scale)

    def get_weights_col_packed(
        self,
        weights: Weights,
        prefix: str,
        block_sizes: Union[int, List[int]],
    ):
        """
        Load a packed, column-sharded weight of `prefix` with its scales.

        Scales holding more than one element are sharded like the weight;
        single-element scales are used as loaded. The input scale is
        reduced to its maximum.
        """
        w = weights.get_packed_sharded(
            f"{prefix}.weight", dim=0, block_sizes=block_sizes
        )

        weight_scale = None
        if self.load_weight_scale:
            scale_name = f"{prefix}.weight_scale"
            weight_scale = weights.get_tensor(scale_name, to_dtype=False)
            if weight_scale.numel() > 1:
                weight_scale = weights.get_packed_sharded(
                    scale_name,
                    dim=0,
                    block_sizes=block_sizes,
                    to_dtype=False,
                )
            weight_scale = weight_scale.reshape(-1).expand(w.shape[0])

        input_scale = None
        if self.load_input_scale:
            scale_name = f"{prefix}.input_scale"
            input_scale = weights.get_tensor(scale_name, to_dtype=False)
            if input_scale.numel() > 1:
                input_scale = weights.get_packed_sharded(
                    scale_name,
                    dim=0,
                    block_sizes=block_sizes,
                    to_dtype=False,
                )
            input_scale = input_scale.reshape(-1).max()

        return self._as_fp8_weight(weights, w, weight_scale, input_scale)

    def get_multi_weights_col(self, weights: "Weights", prefixes: List[str], dim: int):
        """
        Load the column-sharded weights of all `prefixes`, concatenated along `dim`.

        Weight scales are concatenated per prefix; the input scales, when
        present, are reduced to a single maximum.
        """
        # FIXME: Force to_device to false as fp8 weights do not support torch.cat on device yet
        shards = [
            weights.get_sharded(f"{p}.weight", dim=0, to_device=False) for p in prefixes
        ]
        shapes = [shard.shape for shard in shards]

        # Concat then send to the device
        w = torch.cat(shards, dim=dim).to(weights.device)

        weight_scale = None
        if self.load_weight_scale:
            per_prefix_scales = [
                _load_scalar_or_matrix_scale(weights, f"{p}.weight_scale", shape)
                for p, shape in zip(prefixes, shapes)
            ]
            weight_scale = torch.cat(per_prefix_scales, dim=0).reshape(-1)

        input_scale = None
        if self.load_input_scale:
            found_scales = [
                _load_scalar_or_matrix_scale(weights, f"{p}.input_scale", shape)
                for p, shape in zip(prefixes, shapes)
                if weights.has_tensor(f"{p}.input_scale")
            ]
            # Either every prefix has an input scale or none has.
            assert len(found_scales) == 0 or len(found_scales) == len(prefixes)
            if len(found_scales) != 0:
                input_scale = torch.cat(found_scales, dim=0).reshape(-1).max()

        return self._as_fp8_weight(weights, w, weight_scale, input_scale)

    def get_weights_row(self, weights: "Weights", prefix: str):
        """Load the weight of `prefix` sharded along dim 1, with full scales."""
        w = weights.get_sharded(f"{prefix}.weight", dim=1)
        weight_scale, input_scale = self._load_unsharded_scales(
            weights, prefix, w.shape[0]
        )
        return self._as_fp8_weight(weights, w, weight_scale, input_scale)
--- a/server/text_generation_server/layers/compressed_tensors/wna16_int.py
+++ b/server/text_generation_server/layers/compressed_tensors/wna16_int.py
@ -0,0 +1,188 @@
 from typing import List, Union
 import torch
 from compressed_tensors.quantization import ActivationOrdering, QuantizationArgs
 from loguru import logger
 from text_generation_server.layers.marlin.gptq import repack_gptq_for_marlin
 from text_generation_server.utils.log import log_once
 from text_generation_server.utils.weights import Weights, WeightsLoader
 class WNA16Loader(WeightsLoader):
    """
    Loader for W4A16/W8A16 INT compressed-tensors parameters.

    Tensors are read under the compressed-tensors parameter names
    (`weight_packed`, `weight_scale`, `weight_zero_point` and
    `weight_g_idx`) and handed to `repack_gptq_for_marlin`.
    """

    def __init__(self, weights: QuantizationArgs):
        """
        Args:
            weights: Weight quantization arguments of the target group.
        """
        self.weights = weights
        # Activation ordering by group corresponds to GPTQ's `desc_act`.
        self.desc_act = self.weights.actorder == ActivationOrdering.GROUP
        # A missing group size is passed on as -1.
        self.groupsize = (
            -1 if self.weights.group_size is None else self.weights.group_size
        )

    def __str__(self) -> str:
        quantization_type = f"W{self.weights.num_bits}A16"

        return f"{self.__class__.__name__} ({quantization_type})"

    def _not_quantized_error(self) -> RuntimeError:
        """Build the error raised when the packed weight cannot be loaded."""
        return RuntimeError(
            f"Cannot load w{self.weights.num_bits}a16 weight, make sure the model is already quantized"
        )

    def _repack(self, *, weight_packed, scales, zero_point, g_idx, sharded_infeatures):
        """Repack the loaded tensors for Marlin using this loader's settings."""
        return repack_gptq_for_marlin(
            qweight=weight_packed.contiguous(),
            scales=scales,
            qzeros=zero_point,
            g_idx=g_idx,
            bits=self.weights.num_bits,
            desc_act=self.desc_act,
            groupsize=self.groupsize,
            quant_method="compressed-tensors",
            sym=self.weights.symmetric,
            sharded_infeatures=sharded_infeatures,
        )

    def get_weights(self, weights: Weights, prefix: str):
        """
        Load the complete (unsharded) quantized weight of `prefix`.

        Raises:
            RuntimeError: If the packed weight tensor cannot be loaded.
        """
        log_once(logger.info, "Using GPTQ-Marlin kernels")
        try:
            weight_packed = weights.get_tensor(f"{prefix}.weight_packed").t()
        except RuntimeError as err:
            raise self._not_quantized_error() from err

        zero_point = None
        if not self.weights.symmetric:
            zero_point = weights.get_tensor(f"{prefix}.weight_zero_point").t()

        g_idx = None
        if self.desc_act:
            g_idx = weights.get_tensor(f"{prefix}.weight_g_idx")

        # Same tensor name as used by every other method of this loader.
        scales = weights.get_tensor(f"{prefix}.weight_scale").t()

        return self._repack(
            weight_packed=weight_packed,
            scales=scales,
            zero_point=zero_point,
            g_idx=g_idx,
            sharded_infeatures=False,
        )

    def get_weights_col_packed(
        self,
        weights: Weights,
        prefix: str,
        block_sizes: Union[int, List[int]],
    ):
        """
        Load a packed, column-sharded quantized weight of `prefix`.

        Raises:
            RuntimeError: If the packed weight tensor cannot be loaded.
        """
        try:
            weight_packed = weights.get_packed_sharded(
                f"{prefix}.weight_packed", dim=0, block_sizes=block_sizes
            ).t()
        except RuntimeError as err:
            raise self._not_quantized_error() from err

        scales = weights.get_packed_sharded(
            f"{prefix}.weight_scale", dim=0, block_sizes=block_sizes
        ).t()
        scales = scales.to(dtype=weights.dtype)

        zero_point = None
        if not self.weights.symmetric:
            zero_point = weights.get_packed_sharded(
                f"{prefix}.weight_zero_point", dim=0, block_sizes=block_sizes
            ).t()

        g_idx = None
        if self.desc_act:
            g_idx = weights.get_tensor(f"{prefix}.weight_g_idx")

        return self._repack(
            weight_packed=weight_packed,
            scales=scales,
            zero_point=zero_point,
            g_idx=g_idx,
            sharded_infeatures=False,
        )

    def get_multi_weights_col(self, weights: Weights, prefixes: List[str], dim: int):
        """
        Load the column-sharded quantized weights of all `prefixes`.

        The transposed per-prefix tensors are concatenated along dim 1;
        the `dim` argument is not used by this loader.

        Raises:
            RuntimeError: If a packed weight tensor cannot be loaded.
        """
        try:
            weight_packed = torch.cat(
                [
                    weights.get_sharded(f"{p}.weight_packed", dim=0).t()
                    for p in prefixes
                ],
                dim=1,
            )
        except RuntimeError as err:
            raise self._not_quantized_error() from err

        scales = torch.cat(
            [weights.get_sharded(f"{p}.weight_scale", dim=0).t() for p in prefixes],
            dim=1,
        )

        zero_point = None
        if not self.weights.symmetric:
            # NOTE(review): the concatenated zero points are transposed back
            # here, unlike in the other methods of this loader -- confirm
            # that this is the layout `repack_gptq_for_marlin` expects.
            zero_point = torch.cat(
                [
                    weights.get_sharded(f"{p}.weight_zero_point", dim=0).t()
                    for p in prefixes
                ],
                dim=1,
            ).t()

        g_idx = None
        if self.desc_act:
            w = [weights.get_tensor(f"{p}.weight_g_idx") for p in prefixes]
            # All prefixes must share the same group index.
            for w2 in w[1:]:
                torch.testing.assert_close(w2, w[0])
            g_idx = w[0]

        return self._repack(
            weight_packed=weight_packed,
            scales=scales,
            zero_point=zero_point,
            g_idx=g_idx,
            sharded_infeatures=False,
        )

    def get_weights_row(self, weights: Weights, prefix: str):
        """
        Load the quantized weight of `prefix` sharded along dim 1.

        Scales and zero points are loaded in full when activation ordering
        is used or when there is no group size; otherwise they are sharded
        like the weight.

        Raises:
            RuntimeError: If the packed weight tensor cannot be loaded.
        """
        log_once(logger.info, "Using GPTQ-Marlin kernels")
        try:
            weight_packed = weights.get_sharded(f"{prefix}.weight_packed", dim=1).t()
        except RuntimeError as err:
            raise self._not_quantized_error() from err

        load_full_params = self.desc_act or self.groupsize == -1

        zero_point = None
        if not self.weights.symmetric:
            if load_full_params:
                zero_point = weights.get_tensor(f"{prefix}.weight_zero_point").t()
            else:
                zero_point = weights.get_sharded(
                    f"{prefix}.weight_zero_point", dim=1
                ).t()

        g_idx = None
        if self.desc_act:
            g_idx = weights.get_sharded(f"{prefix}.weight_g_idx", dim=0)

        if load_full_params:
            scales = weights.get_tensor(f"{prefix}.weight_scale").t()
        else:
            scales = weights.get_sharded(f"{prefix}.weight_scale", dim=1).t()

        # Input features are only actually sharded with more than one rank.
        sharded_in_features = weights.process_group.size() > 1

        return self._repack(
            weight_packed=weight_packed,
            scales=scales,
            zero_point=zero_point,
            g_idx=g_idx,
            sharded_infeatures=sharded_in_features,
        )
--- a/server/text_generation_server/layers/fp8.py
+++ b/server/text_generation_server/layers/fp8.py
@ -29,7 +29,7 @@ else:
    CUTLASS_FP8_AVAILABLE = False
-def get_fp8_linear() -> Type[torch.nn.Module]:
+def get_fp8_linear(force_w8a16: bool = False) -> Type[torch.nn.Module]:
    """
    Return an FP8 linear `Module` that is compatible with the current system.
    """
@ -37,7 +37,14 @@ def get_fp8_linear() -> Type[torch.nn.Module]:
    if SYSTEM == "cuda":
        major, _ = torch.cuda.get_device_capability()
-        if major == 8 and os.getenv("USE_CUTLASS_W8A8", "0") != "1":
+        # Marlin is W8A16, use it when:
        #
        # - On capability 8.x where x < 8: W8A8 FP8 GEMM is not supported.
        # - On capability 8.9: W8A8 FP8 GEMM is supported, but Marlin-FP8 is faster.
        # - On capability 9.x when force_w8a16: cutlass kernels do not support W8A16.
        if (major == 8 or (major == 9 and force_w8a16)) and os.getenv(
            "USE_CUTLASS_W8A8", "0"
        ) != "1":
            # NOTE: Capability 8.9 is supported by cutlass kernels, but FP8-Marlin
            #       gives better decoding throughput on L4 and L40.
            from text_generation_server.layers.marlin import GPTQMarlinFP8Linear
@ -283,14 +290,17 @@ class Fp8Weight(Weight):
    weight_scale: Optional[torch.Tensor] = None
    input_scale: Optional[torch.Tensor] = None
    activation_scale_ub: Optional[float] = None
    force_w8a16: bool = False
    def get_linear(self, bias: torch.Tensor):
        if self.weight_scale is None:
-            return get_fp8_linear().from_unquant(self.weight, bias, self.dtype)
+            return get_fp8_linear(force_w8a16=self.force_w8a16).from_unquant(
                self.weight, bias, self.dtype
            )
        # This is not checked by the fbgemm kernels, but they require contiguous
        # memory. Can be non-contiguous when we e.g. expand from scalars.
        self.weight_scale = self.weight_scale.contiguous()
-        return get_fp8_linear().from_fp8(
+        return get_fp8_linear(force_w8a16=self.force_w8a16).from_fp8(
            weight=self.weight,
            scale=self.weight_scale,
            dtype=self.dtype,
--- a/server/text_generation_server/layers/marlin/gptq.py
+++ b/server/text_generation_server/layers/marlin/gptq.py
@ -261,7 +261,7 @@ class GPTQMarlinWeight(Weight):
    def __post_init__(self):
        assert self.qweight.dtype == torch.int32
-        assert self.scales.dtype == torch.float16
+        assert self.scales.dtype in (torch.float16, torch.bfloat16)
        assert self.g_idx.dtype == torch.int32
        assert self.perm.dtype == torch.int32
@ -300,7 +300,7 @@ def repack_gptq_for_marlin(
        raise RuntimeError(
            f"Repacking GPTQ weights with group size {groupsize} as Marlin is not supported, must be one of: {supported_sizes}"
        )
-    if not (sym or quant_method == "awq"):
+    if not (sym or quant_method == "awq" or quant_method == "compressed-tensors"):
        raise RuntimeError(
            "Repacking GPTQ weights with asymmetric quantization as Marlin is not supported."
        )
--- a/server/text_generation_server/models/init.py
+++ b/server/text_generation_server/models/init.py
@ -370,46 +370,23 @@ def get_model(
    compression_config = config_dict.get("compression_config", None)
    if quantization_config is not None and quantize is None:
        method = quantization_config.get("quant_method", None)
        config_groups = quantization_config.get("config_groups", None)
        if method in {"gptq", "awq", "exl2"}:
            log_master(logger.info, f"Auto selecting quantization method {method}")
            quantize = method
        elif method == "fbgemm_fp8" or method == "fp8":
            log_master(logger.info, "Auto selecting quantization method fp8")
            quantize = "fp8"
-        elif config_groups is not None:
+        if method == "compressed-tensors":
            # TODO: at some point we should probably fully parse the compression
            # configuration to know which parameters are compressed.
            for _, group in config_groups.items():
                weights_config = group.get("weights")
                if weights_config is not None:
                    if (
                        weights_config["type"] == "float"
                        and weights_config["num_bits"] == 8
                    ):
            log_master(
-                            logger.info, "Auto selecting quantization method fp8"
+                logger.info, "Auto selecting quantization method compressed-tensors"
            )
-                        quantize = "fp8"
+            quantize = "compressed-tensors"
                        break
        else:
            log_master(logger.warning, f"Unknown quantization method {method}")
    elif compression_config is not None:
        # `compression_config` renamed to `quantization_config`; support retained for backward compatibility.
-        config_groups = compression_config.get("config_groups")
+        log_master(logger.info, "Auto selecting quantization method compressed-tensors")
-        if config_groups is not None:
+        quantize = "compressed-tensors"
            for _, group in config_groups.items():
                weights_config = group.get("weights")
                if weights_config is not None:
                    if (
                        weights_config["type"] == "float"
                        and weights_config["num_bits"] == 8
                    ):
                        log_master(
                            logger.info, "Auto selecting quantization method fp8"
                        )
                        quantize = "fp8"
                        break
    if dtype is None:
        if quantize in ["awq", "exl2", "gptq", "marlin"]:
--- a/server/text_generation_server/utils/quantization.py
+++ b/server/text_generation_server/utils/quantization.py
@ -27,7 +27,20 @@ class _FP8QuantizerConfig:
    activation_scale_ub: float
-# We should probably do this with Pytantic JSON deserialization,
+def _get_config_json(model_id: str, revision: Optional[str], filename: str):
    if os.path.exists(
        os.path.join(
            model_id,
        )
    ):
        filename = os.path.join(model_id, filename)
    else:
        filename = hf_hub_download(model_id, filename=filename, revision=revision)
    with open(filename, "r") as f:
        return json.load(f)
 # We should probably do this with Pydantic JSON deserialization,
 # but for now we'll stay close to the old _set_gptq_params.
 def _get_quantizer_config(model_id, revision):
    bits = 4
@ -39,12 +52,7 @@ def _get_quantizer_config(model_id, revision):
    filename = "config.json"
    try:
-        if os.path.exists(os.path.join(model_id, filename)):
+        data = _get_config_json(model_id, revision, filename)
            filename = os.path.join(model_id, filename)
        else:
            filename = hf_hub_download(model_id, filename=filename, revision=revision)
        with open(filename, "r") as f:
            data = json.load(f)
        # FP8 config
        if data["quantization_config"]["quant_method"] == "fbgemm_fp8":
@ -67,14 +75,7 @@ def _get_quantizer_config(model_id, revision):
    except Exception:
        filename = "quantize_config.json"
        try:
-            if os.path.exists(os.path.join(model_id, filename)):
+            data = _get_config_json(model_id, revision, filename)
                filename = os.path.join(model_id, filename)
            else:
                filename = hf_hub_download(
                    model_id, filename=filename, revision=revision
                )
            with open(filename, "r") as f:
                data = json.load(f)
            bits = data["bits"]
            groupsize = data["group_size"]
@ -90,14 +91,7 @@ def _get_quantizer_config(model_id, revision):
        except Exception:
            filename = "quant_config.json"
            try:
-                if os.path.exists(os.path.join(model_id, filename)):
+                data = _get_config_json(model_id, revision, filename)
                    filename = os.path.join(model_id, filename)
                else:
                    filename = hf_hub_download(
                        model_id, filename=filename, revision=revision
                    )
                with open(filename, "r") as f:
                    data = json.load(f)
                bits = data["w_bit"]
                groupsize = data["q_group_size"]
                desc_act = data["desc_act"]
@ -119,6 +113,14 @@ def _get_quantizer_config(model_id, revision):
 def get_loader(
    quantize: Optional[str], model_id: str, revision: Optional[str]
 ) -> WeightsLoader:
    if quantize == "compressed-tensors":
        config = _get_config_json(model_id, revision, "config.json")
        from text_generation_server.layers.compressed_tensors import (
            CompressedTensorsLoader,
        )
        return CompressedTensorsLoader(config)
    quantizer_config = _get_quantizer_config(model_id, revision)
    if quantize in {"awq", "gptq"}:
        from text_generation_server.layers.gptq import GPTQWeightsLoader
@ -0,0 +1,3 @@
from .loader import CompressedTensorsLoader

__all__ = ["CompressedTensorsLoader"]