Merge branch 'main' into moe

2024-11-18 09:45:05 +08:00 · 2024-11-18 09:45:05 +08:00 · e0e39fa0d9
parent 1639152ca4 52e48739a5
commit e0e39fa0d9
42 changed files with 2298 additions and 226 deletions
--- a/16
+++ b/16
@ -161,18 +161,6 @@ COPY server/custom_kernels/ .
 # Build specific version of transformers
 RUN python setup.py build

-# Build vllm CUDA kernels
-FROM kernel-builder AS vllm-builder
-
-WORKDIR /usr/src
-
-ENV TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 8.9 9.0+PTX"
-
-COPY server/Makefile-vllm Makefile
-
-# Build specific version of vllm
-RUN make build-vllm-cuda
-
 # Build mamba kernels
 FROM kernel-builder AS mamba-builder
 WORKDIR /usr/src
@ -230,8 +218,6 @@ COPY --from=awq-kernels-builder /usr/src/llm-awq/awq/kernels/build/lib.linux-x86
 COPY --from=eetq-kernels-builder /usr/src/eetq/build/lib.linux-x86_64-cpython-311 /opt/conda/lib/python3.11/site-packages
 # Copy build artifacts from lorax punica kernels builder
 COPY --from=lorax-punica-builder /usr/src/lorax-punica/server/punica_kernels/build/lib.linux-x86_64-cpython-311 /opt/conda/lib/python3.11/site-packages
-# Copy build artifacts from vllm builder
-COPY --from=vllm-builder /usr/src/vllm/build/lib.linux-x86_64-cpython-311 /opt/conda/lib/python3.11/site-packages
 # Copy build artifacts from mamba builder
 COPY --from=mamba-builder /usr/src/mamba/build/lib.linux-x86_64-cpython-311/ /opt/conda/lib/python3.11/site-packages
 COPY --from=mamba-builder /usr/src/causal-conv1d/build/lib.linux-x86_64-cpython-311/ /opt/conda/lib/python3.11/site-packages
@ -247,7 +233,7 @@ COPY server/Makefile server/Makefile
 RUN cd server && \
    make gen-server && \
    pip install -r requirements_cuda.txt && \
-    pip install ".[bnb, accelerate, marlin, moe, quantize, peft, outlines]" --no-cache-dir && \
+    pip install ".[attention, bnb, accelerate, compressed-tensors, marlin, moe, quantize, peft, outlines]" --no-cache-dir && \
    pip install nvidia-nccl-cu12==2.22.3

 ENV LD_PRELOAD=/opt/conda/lib/python3.11/site-packages/nvidia/nccl/lib/libnccl.so.2
--- a/2
+++ b/2
@ -296,7 +296,7 @@ COPY server/Makefile server/Makefile
 RUN cd server && \
    make gen-server && \
    pip install -r requirements_rocm.txt && \
-    pip install ".[accelerate, peft, outlines]" --no-cache-dir
+    pip install ".[accelerate, compressed-tensors, peft, outlines]" --no-cache-dir

 # Install benchmarker
 COPY --from=builder /usr/src/target/release-opt/text-generation-benchmark /usr/local/bin/text-generation-benchmark
--- a/2
+++ b/2
@ -117,7 +117,7 @@ COPY server/Makefile server/Makefile
 RUN cd server && \
    make gen-server && \
    pip install -r requirements_intel.txt && \
-    pip install ".[accelerate, peft, outlines]" --no-cache-dir
+    pip install ".[accelerate, compressed-tensors, peft, outlines]" --no-cache-dir

 ENV CCL_ROOT=/opt/intel/oneapi/ccl/latest
 ENV I_MPI_ROOT=/opt/intel/oneapi/mpi/latest
--- a/benchmark/src/generation.rs
+++ b/benchmark/src/generation.rs
@ -180,7 +180,7 @@ async fn prefill(
    let latency = start_time.elapsed();

    // Compute throughput from latency and batch size
-    let throughput = batch_size as f64 / latency.as_secs_f64();
+    let throughput = (batch_size * sequence_length) as f64 / latency.as_secs_f64();

    // Decode batch cannot be empty
    let decode_batch = decode_batch.expect("decode_batch is None. This is a bug.");
--- a/docs/openapi.json
+++ b/docs/openapi.json
@ -36,8 +36,11 @@
            "content": {
              "application/json": {
                "schema": {
+                  "type": "array",
+                  "items": {
                    "$ref": "#/components/schemas/GenerateResponse"
                  }
+                }
              },
              "text/event-stream": {
                "schema": {
--- a/docs/source/reference/launcher.md
+++ b/docs/source/reference/launcher.md
@ -63,6 +63,7 @@ Options:

          Possible values:
          - awq:                4 bit quantization. Requires a specific AWQ quantized model: <https://hf.co/models?search=awq>. Should replace GPTQ models wherever possible because of the better latency
+          - compressed-tensors: Compressed tensors, which can be a mixture of different quantization methods
          - eetq:               8 bit quantization, doesn't require specific model. Should be a drop-in replacement to bitsandbytes with much better performance. Kernels are from <https://github.com/NetEase-FuXi/EETQ.git>
          - exl2:               Variable bit quantization. Requires a specific EXL2 quantized model: <https://hf.co/models?search=exl2>. Requires exllama2 kernels and does not support tensor parallelism (num_shard > 1)
          - gptq:               4 bit quantization. Requires a specific GPTQ quantized model: <https://hf.co/models?search=gptq>. text-generation-inference will use exllama (faster) kernels wherever possible, and use triton kernel (wider support) when it's not. AWQ has faster kernels
--- a/flake.lock
+++ b/flake.lock
@ -718,11 +718,11 @@
    },
    "nixpkgs_6": {
      "locked": {
-        "lastModified": 1727675176,
-        "narHash": "sha256-xIjBFMYldWvj+g8ahxMPofsj+OqxvKJN6YylNHQ7gn4=",
+        "lastModified": 1731562571,
+        "narHash": "sha256-9V0C/H6NL2Vk3Y76msqNA8TgwZ6Ge4frOVawTNFJQmM=",
        "owner": "nixos",
        "repo": "nixpkgs",
-        "rev": "a6d0207fea9212d28cd3d487efe6bc699663b93a",
+        "rev": "19d66fab291f90ce56d0479b128cc7a5271bf666",
        "type": "github"
      },
      "original": {
@ -978,11 +978,11 @@
        "nixpkgs": "nixpkgs_6"
      },
      "locked": {
-        "lastModified": 1730724647,
-        "narHash": "sha256-SVv+50CGaCoU4zZwsg6ZAaOi/D5QJBL1P2SIB+3CEf4=",
+        "lastModified": 1731674227,
+        "narHash": "sha256-k/ur37KSc+RXcwwz0tgxeamz6wQ5rsOe5hMepzIdD2s=",
        "owner": "huggingface",
        "repo": "text-generation-inference-nix",
-        "rev": "1512898a1e5ad9eff025205fa9c4d33a44506cf3",
+        "rev": "407b9e22a0b7121bf6e171d67ce0144e3f3e39bf",
        "type": "github"
      },
      "original": {
--- a/integration-tests/models/snapshots/test_compressed_tensors_w8an_fp/test_compressed_tensors_w8an.json
+++ b/integration-tests/models/snapshots/test_compressed_tensors_w8an_fp/test_compressed_tensors_w8an.json
@ -0,0 +1,104 @@
+{
+  "details": {
+    "best_of_sequences": null,
+    "finish_reason": "length",
+    "generated_tokens": 10,
+    "prefill": [
+      {
+        "id": 128000,
+        "logprob": null,
+        "text": "<|begin_of_text|>"
+      },
+      {
+        "id": 3923,
+        "logprob": -7.609375,
+        "text": "What"
+      },
+      {
+        "id": 374,
+        "logprob": -0.92529297,
+        "text": " is"
+      },
+      {
+        "id": 5655,
+        "logprob": -10.0,
+        "text": " deep"
+      },
+      {
+        "id": 6975,
+        "logprob": -0.94628906,
+        "text": " learning"
+      },
+      {
+        "id": 30,
+        "logprob": -2.9042969,
+        "text": "?"
+      }
+    ],
+    "seed": null,
+    "tokens": [
+      {
+        "id": 18682,
+        "logprob": -0.8769531,
+        "special": false,
+        "text": " Deep"
+      },
+      {
+        "id": 6975,
+        "logprob": -0.0076942444,
+        "special": false,
+        "text": " learning"
+      },
+      {
+        "id": 374,
+        "logprob": -0.25073242,
+        "special": false,
+        "text": " is"
+      },
+      {
+        "id": 264,
+        "logprob": -0.097595215,
+        "special": false,
+        "text": " a"
+      },
+      {
+        "id": 955,
+        "logprob": -0.921875,
+        "special": false,
+        "text": " type"
+      },
+      {
+        "id": 315,
+        "logprob": -0.00027918816,
+        "special": false,
+        "text": " of"
+      },
+      {
+        "id": 21075,
+        "logprob": -0.5527344,
+        "special": false,
+        "text": " artificial"
+      },
+      {
+        "id": 11478,
+        "logprob": -0.042541504,
+        "special": false,
+        "text": " intelligence"
+      },
+      {
+        "id": 320,
+        "logprob": -0.38891602,
+        "special": false,
+        "text": " ("
+      },
+      {
+        "id": 15836,
+        "logprob": -0.0011043549,
+        "special": false,
+        "text": "AI"
+      }
+    ],
+    "top_tokens": null
+  },
+  "generated_text": " Deep learning is a type of artificial intelligence (AI"
+}
--- a/integration-tests/models/snapshots/test_compressed_tensors_w8an_fp/test_compressed_tensors_w8an_all_params.json
+++ b/integration-tests/models/snapshots/test_compressed_tensors_w8an_fp/test_compressed_tensors_w8an_all_params.json
@ -0,0 +1,99 @@
+{
+  "details": {
+    "best_of_sequences": null,
+    "finish_reason": "length",
+    "generated_tokens": 10,
+    "prefill": [
+      {
+        "id": 128000,
+        "logprob": null,
+        "text": "<|begin_of_text|>"
+      },
+      {
+        "id": 3923,
+        "logprob": -7.609375,
+        "text": "What"
+      },
+      {
+        "id": 374,
+        "logprob": -0.92529297,
+        "text": " is"
+      },
+      {
+        "id": 5655,
+        "logprob": -10.0,
+        "text": " deep"
+      },
+      {
+        "id": 6975,
+        "logprob": -0.94628906,
+        "text": " learning"
+      }
+    ],
+    "seed": 0,
+    "tokens": [
+      {
+        "id": 5380,
+        "logprob": -0.23840332,
+        "special": false,
+        "text": "?\n"
+      },
+      {
+        "id": 34564,
+        "logprob": 0.0,
+        "special": false,
+        "text": "Deep"
+      },
+      {
+        "id": 6975,
+        "logprob": 0.0,
+        "special": false,
+        "text": " learning"
+      },
+      {
+        "id": 11,
+        "logprob": 0.0,
+        "special": false,
+        "text": ","
+      },
+      {
+        "id": 1101,
+        "logprob": -1.2011719,
+        "special": false,
+        "text": " also"
+      },
+      {
+        "id": 3967,
+        "logprob": 0.0,
+        "special": false,
+        "text": " known"
+      },
+      {
+        "id": 439,
+        "logprob": 0.0,
+        "special": false,
+        "text": " as"
+      },
+      {
+        "id": 30828,
+        "logprob": 0.0,
+        "special": false,
+        "text": " neural"
+      },
+      {
+        "id": 4009,
+        "logprob": -0.6777344,
+        "special": false,
+        "text": " network"
+      },
+      {
+        "id": 477,
+        "logprob": 0.0,
+        "special": false,
+        "text": " or"
+      }
+    ],
+    "top_tokens": null
+  },
+  "generated_text": "What is deep learning?\nDeep learning, also known as neural network or"
+}
--- a/integration-tests/models/snapshots/test_compressed_tensors_w8an_fp/test_compressed_tensors_w8an_load.json
+++ b/integration-tests/models/snapshots/test_compressed_tensors_w8an_fp/test_compressed_tensors_w8an_load.json
@ -0,0 +1,418 @@
+[
+  {
+    "details": {
+      "best_of_sequences": null,
+      "finish_reason": "length",
+      "generated_tokens": 10,
+      "prefill": [
+        {
+          "id": 128000,
+          "logprob": null,
+          "text": "<|begin_of_text|>"
+        },
+        {
+          "id": 3923,
+          "logprob": -7.609375,
+          "text": "What"
+        },
+        {
+          "id": 374,
+          "logprob": -0.92529297,
+          "text": " is"
+        },
+        {
+          "id": 5655,
+          "logprob": -10.0,
+          "text": " deep"
+        },
+        {
+          "id": 6975,
+          "logprob": -0.94628906,
+          "text": " learning"
+        },
+        {
+          "id": 30,
+          "logprob": -2.9042969,
+          "text": "?"
+        }
+      ],
+      "seed": null,
+      "tokens": [
+        {
+          "id": 18682,
+          "logprob": -0.8769531,
+          "special": false,
+          "text": " Deep"
+        },
+        {
+          "id": 6975,
+          "logprob": -0.0076942444,
+          "special": false,
+          "text": " learning"
+        },
+        {
+          "id": 374,
+          "logprob": -0.25146484,
+          "special": false,
+          "text": " is"
+        },
+        {
+          "id": 264,
+          "logprob": -0.097595215,
+          "special": false,
+          "text": " a"
+        },
+        {
+          "id": 955,
+          "logprob": -0.9248047,
+          "special": false,
+          "text": " type"
+        },
+        {
+          "id": 315,
+          "logprob": -0.00027513504,
+          "special": false,
+          "text": " of"
+        },
+        {
+          "id": 21075,
+          "logprob": -0.5527344,
+          "special": false,
+          "text": " artificial"
+        },
+        {
+          "id": 11478,
+          "logprob": -0.043151855,
+          "special": false,
+          "text": " intelligence"
+        },
+        {
+          "id": 320,
+          "logprob": -0.3840332,
+          "special": false,
+          "text": " ("
+        },
+        {
+          "id": 15836,
+          "logprob": -0.0011043549,
+          "special": false,
+          "text": "AI"
+        }
+      ],
+      "top_tokens": null
+    },
+    "generated_text": " Deep learning is a type of artificial intelligence (AI"
+  },
+  {
+    "details": {
+      "best_of_sequences": null,
+      "finish_reason": "length",
+      "generated_tokens": 10,
+      "prefill": [
+        {
+          "id": 128000,
+          "logprob": null,
+          "text": "<|begin_of_text|>"
+        },
+        {
+          "id": 3923,
+          "logprob": -7.6054688,
+          "text": "What"
+        },
+        {
+          "id": 374,
+          "logprob": -0.92089844,
+          "text": " is"
+        },
+        {
+          "id": 5655,
+          "logprob": -10.0,
+          "text": " deep"
+        },
+        {
+          "id": 6975,
+          "logprob": -0.94433594,
+          "text": " learning"
+        },
+        {
+          "id": 30,
+          "logprob": -2.90625,
+          "text": "?"
+        }
+      ],
+      "seed": null,
+      "tokens": [
+        {
+          "id": 18682,
+          "logprob": -0.875,
+          "special": false,
+          "text": " Deep"
+        },
+        {
+          "id": 6975,
+          "logprob": -0.007698059,
+          "special": false,
+          "text": " learning"
+        },
+        {
+          "id": 374,
+          "logprob": -0.25268555,
+          "special": false,
+          "text": " is"
+        },
+        {
+          "id": 264,
+          "logprob": -0.09753418,
+          "special": false,
+          "text": " a"
+        },
+        {
+          "id": 955,
+          "logprob": -0.92529297,
+          "special": false,
+          "text": " type"
+        },
+        {
+          "id": 315,
+          "logprob": -0.00027942657,
+          "special": false,
+          "text": " of"
+        },
+        {
+          "id": 21075,
+          "logprob": -0.5527344,
+          "special": false,
+          "text": " artificial"
+        },
+        {
+          "id": 11478,
+          "logprob": -0.042541504,
+          "special": false,
+          "text": " intelligence"
+        },
+        {
+          "id": 320,
+          "logprob": -0.3840332,
+          "special": false,
+          "text": " ("
+        },
+        {
+          "id": 15836,
+          "logprob": -0.0011053085,
+          "special": false,
+          "text": "AI"
+        }
+      ],
+      "top_tokens": null
+    },
+    "generated_text": " Deep learning is a type of artificial intelligence (AI"
+  },
+  {
+    "details": {
+      "best_of_sequences": null,
+      "finish_reason": "length",
+      "generated_tokens": 10,
+      "prefill": [
+        {
+          "id": 128000,
+          "logprob": null,
+          "text": "<|begin_of_text|>"
+        },
+        {
+          "id": 3923,
+          "logprob": -7.6054688,
+          "text": "What"
+        },
+        {
+          "id": 374,
+          "logprob": -0.92089844,
+          "text": " is"
+        },
+        {
+          "id": 5655,
+          "logprob": -10.0,
+          "text": " deep"
+        },
+        {
+          "id": 6975,
+          "logprob": -0.94433594,
+          "text": " learning"
+        },
+        {
+          "id": 30,
+          "logprob": -2.90625,
+          "text": "?"
+        }
+      ],
+      "seed": null,
+      "tokens": [
+        {
+          "id": 18682,
+          "logprob": -0.875,
+          "special": false,
+          "text": " Deep"
+        },
+        {
+          "id": 6975,
+          "logprob": -0.007698059,
+          "special": false,
+          "text": " learning"
+        },
+        {
+          "id": 374,
+          "logprob": -0.25268555,
+          "special": false,
+          "text": " is"
+        },
+        {
+          "id": 264,
+          "logprob": -0.09753418,
+          "special": false,
+          "text": " a"
+        },
+        {
+          "id": 955,
+          "logprob": -0.92529297,
+          "special": false,
+          "text": " type"
+        },
+        {
+          "id": 315,
+          "logprob": -0.00027942657,
+          "special": false,
+          "text": " of"
+        },
+        {
+          "id": 21075,
+          "logprob": -0.5527344,
+          "special": false,
+          "text": " artificial"
+        },
+        {
+          "id": 11478,
+          "logprob": -0.042541504,
+          "special": false,
+          "text": " intelligence"
+        },
+        {
+          "id": 320,
+          "logprob": -0.3840332,
+          "special": false,
+          "text": " ("
+        },
+        {
+          "id": 15836,
+          "logprob": -0.0011053085,
+          "special": false,
+          "text": "AI"
+        }
+      ],
+      "top_tokens": null
+    },
+    "generated_text": " Deep learning is a type of artificial intelligence (AI"
+  },
+  {
+    "details": {
+      "best_of_sequences": null,
+      "finish_reason": "length",
+      "generated_tokens": 10,
+      "prefill": [
+        {
+          "id": 128000,
+          "logprob": null,
+          "text": "<|begin_of_text|>"
+        },
+        {
+          "id": 3923,
+          "logprob": -7.6054688,
+          "text": "What"
+        },
+        {
+          "id": 374,
+          "logprob": -0.92089844,
+          "text": " is"
+        },
+        {
+          "id": 5655,
+          "logprob": -10.0,
+          "text": " deep"
+        },
+        {
+          "id": 6975,
+          "logprob": -0.94433594,
+          "text": " learning"
+        },
+        {
+          "id": 30,
+          "logprob": -2.90625,
+          "text": "?"
+        }
+      ],
+      "seed": null,
+      "tokens": [
+        {
+          "id": 18682,
+          "logprob": -0.875,
+          "special": false,
+          "text": " Deep"
+        },
+        {
+          "id": 6975,
+          "logprob": -0.007698059,
+          "special": false,
+          "text": " learning"
+        },
+        {
+          "id": 374,
+          "logprob": -0.25268555,
+          "special": false,
+          "text": " is"
+        },
+        {
+          "id": 264,
+          "logprob": -0.09753418,
+          "special": false,
+          "text": " a"
+        },
+        {
+          "id": 955,
+          "logprob": -0.92529297,
+          "special": false,
+          "text": " type"
+        },
+        {
+          "id": 315,
+          "logprob": -0.00027942657,
+          "special": false,
+          "text": " of"
+        },
+        {
+          "id": 21075,
+          "logprob": -0.5527344,
+          "special": false,
+          "text": " artificial"
+        },
+        {
+          "id": 11478,
+          "logprob": -0.042541504,
+          "special": false,
+          "text": " intelligence"
+        },
+        {
+          "id": 320,
+          "logprob": -0.3840332,
+          "special": false,
+          "text": " ("
+        },
+        {
+          "id": 15836,
+          "logprob": -0.0011053085,
+          "special": false,
+          "text": "AI"
+        }
+      ],
+      "top_tokens": null
+    },
+    "generated_text": " Deep learning is a type of artificial intelligence (AI"
+  }
+]
--- a/integration-tests/models/snapshots/test_compressed_tensors_wna16_int/test_compressed_tensors_wna16.json
+++ b/integration-tests/models/snapshots/test_compressed_tensors_wna16_int/test_compressed_tensors_wna16.json
@ -0,0 +1,104 @@
+{
+  "details": {
+    "best_of_sequences": null,
+    "finish_reason": "length",
+    "generated_tokens": 10,
+    "prefill": [
+      {
+        "id": 2,
+        "logprob": null,
+        "text": "<bos>"
+      },
+      {
+        "id": 1841,
+        "logprob": -5.46875,
+        "text": "What"
+      },
+      {
+        "id": 603,
+        "logprob": -0.69140625,
+        "text": " is"
+      },
+      {
+        "id": 5271,
+        "logprob": -12.0,
+        "text": " deep"
+      },
+      {
+        "id": 6044,
+        "logprob": -0.32226562,
+        "text": " learning"
+      },
+      {
+        "id": 235336,
+        "logprob": -0.33203125,
+        "text": "?"
+      }
+    ],
+    "seed": null,
+    "tokens": [
+      {
+        "id": 109,
+        "logprob": -0.24707031,
+        "special": false,
+        "text": "\n\n"
+      },
+      {
+        "id": 26843,
+        "logprob": -0.14550781,
+        "special": false,
+        "text": "Deep"
+      },
+      {
+        "id": 6044,
+        "logprob": -0.038330078,
+        "special": false,
+        "text": " learning"
+      },
+      {
+        "id": 603,
+        "logprob": -0.029907227,
+        "special": false,
+        "text": " is"
+      },
+      {
+        "id": 476,
+        "logprob": -0.020996094,
+        "special": false,
+        "text": " a"
+      },
+      {
+        "id": 38397,
+        "logprob": -0.828125,
+        "special": false,
+        "text": " subset"
+      },
+      {
+        "id": 576,
+        "logprob": -0.00049209595,
+        "special": false,
+        "text": " of"
+      },
+      {
+        "id": 6479,
+        "logprob": -0.057373047,
+        "special": false,
+        "text": " machine"
+      },
+      {
+        "id": 6044,
+        "logprob": -0.000207901,
+        "special": false,
+        "text": " learning"
+      },
+      {
+        "id": 674,
+        "logprob": -0.15429688,
+        "special": false,
+        "text": " that"
+      }
+    ],
+    "top_tokens": null
+  },
+  "generated_text": "\n\nDeep learning is a subset of machine learning that"
+}
--- a/integration-tests/models/snapshots/test_compressed_tensors_wna16_int/test_compressed_tensors_wna16_all_params.json
+++ b/integration-tests/models/snapshots/test_compressed_tensors_wna16_int/test_compressed_tensors_wna16_all_params.json
@ -0,0 +1,99 @@
+{
+  "details": {
+    "best_of_sequences": null,
+    "finish_reason": "length",
+    "generated_tokens": 10,
+    "prefill": [
+      {
+        "id": 2,
+        "logprob": null,
+        "text": "<bos>"
+      },
+      {
+        "id": 1841,
+        "logprob": -5.46875,
+        "text": "What"
+      },
+      {
+        "id": 603,
+        "logprob": -0.69140625,
+        "text": " is"
+      },
+      {
+        "id": 5271,
+        "logprob": -12.0,
+        "text": " deep"
+      },
+      {
+        "id": 6044,
+        "logprob": -0.32226562,
+        "text": " learning"
+      }
+    ],
+    "seed": 0,
+    "tokens": [
+      {
+        "id": 235336,
+        "logprob": 0.0,
+        "special": false,
+        "text": "?"
+      },
+      {
+        "id": 109,
+        "logprob": 0.0,
+        "special": false,
+        "text": "\n\n"
+      },
+      {
+        "id": 26843,
+        "logprob": 0.0,
+        "special": false,
+        "text": "Deep"
+      },
+      {
+        "id": 14715,
+        "logprob": -0.38671875,
+        "special": false,
+        "text": " Learning"
+      },
+      {
+        "id": 603,
+        "logprob": 0.0,
+        "special": false,
+        "text": " is"
+      },
+      {
+        "id": 476,
+        "logprob": 0.0,
+        "special": false,
+        "text": " a"
+      },
+      {
+        "id": 38397,
+        "logprob": -0.12695312,
+        "special": false,
+        "text": " subset"
+      },
+      {
+        "id": 576,
+        "logprob": 0.0,
+        "special": false,
+        "text": " of"
+      },
+      {
+        "id": 6479,
+        "logprob": 0.0,
+        "special": false,
+        "text": " machine"
+      },
+      {
+        "id": 6044,
+        "logprob": 0.0,
+        "special": false,
+        "text": " learning"
+      }
+    ],
+    "top_tokens": null
+  },
+  "generated_text": "What is deep learning?\n\nDeep Learning is a subset of machine learning"
+}
--- a/integration-tests/models/snapshots/test_compressed_tensors_wna16_int/test_compressed_tensors_wna16_load.json
+++ b/integration-tests/models/snapshots/test_compressed_tensors_wna16_int/test_compressed_tensors_wna16_load.json
@ -0,0 +1,418 @@
+[
+  {
+    "details": {
+      "best_of_sequences": null,
+      "finish_reason": "length",
+      "generated_tokens": 10,
+      "prefill": [
+        {
+          "id": 2,
+          "logprob": null,
+          "text": "<bos>"
+        },
+        {
+          "id": 1841,
+          "logprob": -5.46875,
+          "text": "What"
+        },
+        {
+          "id": 603,
+          "logprob": -0.69140625,
+          "text": " is"
+        },
+        {
+          "id": 5271,
+          "logprob": -12.0,
+          "text": " deep"
+        },
+        {
+          "id": 6044,
+          "logprob": -0.32226562,
+          "text": " learning"
+        },
+        {
+          "id": 235336,
+          "logprob": -0.33203125,
+          "text": "?"
+        }
+      ],
+      "seed": null,
+      "tokens": [
+        {
+          "id": 109,
+          "logprob": -0.24707031,
+          "special": false,
+          "text": "\n\n"
+        },
+        {
+          "id": 26843,
+          "logprob": -0.14550781,
+          "special": false,
+          "text": "Deep"
+        },
+        {
+          "id": 6044,
+          "logprob": -0.03857422,
+          "special": false,
+          "text": " learning"
+        },
+        {
+          "id": 603,
+          "logprob": -0.030883789,
+          "special": false,
+          "text": " is"
+        },
+        {
+          "id": 476,
+          "logprob": -0.020996094,
+          "special": false,
+          "text": " a"
+        },
+        {
+          "id": 38397,
+          "logprob": -0.828125,
+          "special": false,
+          "text": " subset"
+        },
+        {
+          "id": 576,
+          "logprob": -0.00051498413,
+          "special": false,
+          "text": " of"
+        },
+        {
+          "id": 6479,
+          "logprob": -0.05883789,
+          "special": false,
+          "text": " machine"
+        },
+        {
+          "id": 6044,
+          "logprob": -0.00020694733,
+          "special": false,
+          "text": " learning"
+        },
+        {
+          "id": 674,
+          "logprob": -0.15820312,
+          "special": false,
+          "text": " that"
+        }
+      ],
+      "top_tokens": null
+    },
+    "generated_text": "\n\nDeep learning is a subset of machine learning that"
+  },
+  {
+    "details": {
+      "best_of_sequences": null,
+      "finish_reason": "length",
+      "generated_tokens": 10,
+      "prefill": [
+        {
+          "id": 2,
+          "logprob": null,
+          "text": "<bos>"
+        },
+        {
+          "id": 1841,
+          "logprob": -5.46875,
+          "text": "What"
+        },
+        {
+          "id": 603,
+          "logprob": -0.71484375,
+          "text": " is"
+        },
+        {
+          "id": 5271,
+          "logprob": -12.0,
+          "text": " deep"
+        },
+        {
+          "id": 6044,
+          "logprob": -0.30859375,
+          "text": " learning"
+        },
+        {
+          "id": 235336,
+          "logprob": -0.3359375,
+          "text": "?"
+        }
+      ],
+      "seed": null,
+      "tokens": [
+        {
+          "id": 109,
+          "logprob": -0.23828125,
+          "special": false,
+          "text": "\n\n"
+        },
+        {
+          "id": 26843,
+          "logprob": -0.14550781,
+          "special": false,
+          "text": "Deep"
+        },
+        {
+          "id": 6044,
+          "logprob": -0.038330078,
+          "special": false,
+          "text": " learning"
+        },
+        {
+          "id": 603,
+          "logprob": -0.030883789,
+          "special": false,
+          "text": " is"
+        },
+        {
+          "id": 476,
+          "logprob": -0.020996094,
+          "special": false,
+          "text": " a"
+        },
+        {
+          "id": 38397,
+          "logprob": -0.80859375,
+          "special": false,
+          "text": " subset"
+        },
+        {
+          "id": 576,
+          "logprob": -0.0005455017,
+          "special": false,
+          "text": " of"
+        },
+        {
+          "id": 6479,
+          "logprob": -0.05908203,
+          "special": false,
+          "text": " machine"
+        },
+        {
+          "id": 6044,
+          "logprob": -0.00020599365,
+          "special": false,
+          "text": " learning"
+        },
+        {
+          "id": 674,
+          "logprob": -0.17285156,
+          "special": false,
+          "text": " that"
+        }
+      ],
+      "top_tokens": null
+    },
+    "generated_text": "\n\nDeep learning is a subset of machine learning that"
+  },
+  {
+    "details": {
+      "best_of_sequences": null,
+      "finish_reason": "length",
+      "generated_tokens": 10,
+      "prefill": [
+        {
+          "id": 2,
+          "logprob": null,
+          "text": "<bos>"
+        },
+        {
+          "id": 1841,
+          "logprob": -5.46875,
+          "text": "What"
+        },
+        {
+          "id": 603,
+          "logprob": -0.71484375,
+          "text": " is"
+        },
+        {
+          "id": 5271,
+          "logprob": -12.0,
+          "text": " deep"
+        },
+        {
+          "id": 6044,
+          "logprob": -0.30859375,
+          "text": " learning"
+        },
+        {
+          "id": 235336,
+          "logprob": -0.3359375,
+          "text": "?"
+        }
+      ],
+      "seed": null,
+      "tokens": [
+        {
+          "id": 109,
+          "logprob": -0.23828125,
+          "special": false,
+          "text": "\n\n"
+        },
+        {
+          "id": 26843,
+          "logprob": -0.14550781,
+          "special": false,
+          "text": "Deep"
+        },
+        {
+          "id": 6044,
+          "logprob": -0.038330078,
+          "special": false,
+          "text": " learning"
+        },
+        {
+          "id": 603,
+          "logprob": -0.030883789,
+          "special": false,
+          "text": " is"
+        },
+        {
+          "id": 476,
+          "logprob": -0.020996094,
+          "special": false,
+          "text": " a"
+        },
+        {
+          "id": 38397,
+          "logprob": -0.80859375,
+          "special": false,
+          "text": " subset"
+        },
+        {
+          "id": 576,
+          "logprob": -0.0005455017,
+          "special": false,
+          "text": " of"
+        },
+        {
+          "id": 6479,
+          "logprob": -0.05908203,
+          "special": false,
+          "text": " machine"
+        },
+        {
+          "id": 6044,
+          "logprob": -0.00020599365,
+          "special": false,
+          "text": " learning"
+        },
+        {
+          "id": 674,
+          "logprob": -0.17285156,
+          "special": false,
+          "text": " that"
+        }
+      ],
+      "top_tokens": null
+    },
+    "generated_text": "\n\nDeep learning is a subset of machine learning that"
+  },
+  {
+    "details": {
+      "best_of_sequences": null,
+      "finish_reason": "length",
+      "generated_tokens": 10,
+      "prefill": [
+        {
+          "id": 2,
+          "logprob": null,
+          "text": "<bos>"
+        },
+        {
+          "id": 1841,
+          "logprob": -5.46875,
+          "text": "What"
+        },
+        {
+          "id": 603,
+          "logprob": -0.71484375,
+          "text": " is"
+        },
+        {
+          "id": 5271,
+          "logprob": -12.0,
+          "text": " deep"
+        },
+        {
+          "id": 6044,
+          "logprob": -0.30859375,
+          "text": " learning"
+        },
+        {
+          "id": 235336,
+          "logprob": -0.3359375,
+          "text": "?"
+        }
+      ],
+      "seed": null,
+      "tokens": [
+        {
+          "id": 109,
+          "logprob": -0.23828125,
+          "special": false,
+          "text": "\n\n"
+        },
+        {
+          "id": 26843,
+          "logprob": -0.14550781,
+          "special": false,
+          "text": "Deep"
+        },
+        {
+          "id": 6044,
+          "logprob": -0.038330078,
+          "special": false,
+          "text": " learning"
+        },
+        {
+          "id": 603,
+          "logprob": -0.030883789,
+          "special": false,
+          "text": " is"
+        },
+        {
+          "id": 476,
+          "logprob": -0.020996094,
+          "special": false,
+          "text": " a"
+        },
+        {
+          "id": 38397,
+          "logprob": -0.80859375,
+          "special": false,
+          "text": " subset"
+        },
+        {
+          "id": 576,
+          "logprob": -0.0005455017,
+          "special": false,
+          "text": " of"
+        },
+        {
+          "id": 6479,
+          "logprob": -0.05908203,
+          "special": false,
+          "text": " machine"
+        },
+        {
+          "id": 6044,
+          "logprob": -0.00020599365,
+          "special": false,
+          "text": " learning"
+        },
+        {
+          "id": 674,
+          "logprob": -0.17285156,
+          "special": false,
+          "text": " that"
+        }
+      ],
+      "top_tokens": null
+    },
+    "generated_text": "\n\nDeep learning is a subset of machine learning that"
+  }
+]
--- a/integration-tests/models/test_compressed_tensors_w8an_fp.py
+++ b/integration-tests/models/test_compressed_tensors_w8an_fp.py
@ -0,0 +1,95 @
+"""Integration tests for the compressed-tensors FP8 Llama 3.2 1B checkpoint."""
+
+import pytest
+
+
+@pytest.fixture(scope="module")
+def compressed_tensors_w8an_handle(launcher):
+    """Launch the FP8 checkpoint on two shards; shared by the whole module."""
+    with launcher(
+        "neuralmagic/Llama-3.2-1B-Instruct-FP8",
+        num_shard=2,
+        quantize="compressed-tensors",
+    ) as handle:
+        yield handle
+
+
+@pytest.fixture(scope="module")
+async def compressed_tensors_w8an(compressed_tensors_w8an_handle):
+    """Await the handle's `health(300)` check, then return its client."""
+    await compressed_tensors_w8an_handle.health(300)
+    return compressed_tensors_w8an_handle.client
+
+
+@pytest.mark.release
+@pytest.mark.asyncio
+@pytest.mark.private
+async def test_compressed_tensors_w8an(compressed_tensors_w8an, response_snapshot):
+    """Generate with default sampling and compare against the snapshot."""
+    response = await compressed_tensors_w8an.generate(
+        "What is deep learning?",
+        max_new_tokens=10,
+        decoder_input_details=True,
+    )
+
+    assert (
+        response.generated_text
+        == " Deep learning is a type of artificial intelligence (AI"
+    )
+    assert response.details.generated_tokens == 10
+    assert response == response_snapshot
+
+
+@pytest.mark.release
+@pytest.mark.asyncio
+@pytest.mark.private
+async def test_compressed_tensors_w8an_all_params(
+    compressed_tensors_w8an, response_snapshot
+):
+    """Generate with every sampling parameter set and a fixed seed."""
+    response = await compressed_tensors_w8an.generate(
+        "What is deep learning",
+        max_new_tokens=10,
+        repetition_penalty=1.2,
+        return_full_text=True,
+        stop_sequences=["test"],
+        temperature=0.5,
+        top_p=0.9,
+        top_k=10,
+        truncate=5,
+        typical_p=0.9,
+        watermark=True,
+        decoder_input_details=True,
+        seed=0,
+    )
+
+    assert response.details.generated_tokens == 10
+    assert (
+        response.generated_text
+        == "What is deep learning?\nDeep learning, also known as neural network or"
+    )
+    assert response == response_snapshot
+
+
+@pytest.mark.release
+@pytest.mark.asyncio
+@pytest.mark.private
+async def test_compressed_tensors_w8an_load(
+    compressed_tensors_w8an, generate_load, response_snapshot
+):
+    """Send four identical requests and require four identical generations."""
+    responses = await generate_load(
+        compressed_tensors_w8an,
+        "What is deep learning?",
+        max_new_tokens=10,
+        n=4,
+    )
+
+    assert (
+        responses[0].generated_text
+        == " Deep learning is a type of artificial intelligence (AI"
+    )
+    assert len(responses) == 4
+    assert all(r.generated_text == responses[0].generated_text for r in responses)
+
+    assert responses == response_snapshot
--- a/integration-tests/models/test_compressed_tensors_wna16_int.py
+++ b/integration-tests/models/test_compressed_tensors_wna16_int.py
@ -0,0 +1,95 @
+"""Integration tests for the compressed-tensors W4A16 Gemma 2 2B checkpoint."""
+
+import pytest
+
+
+@pytest.fixture(scope="module")
+def compressed_tensors_wna16_handle(launcher):
+    """Launch the W4A16 checkpoint on two shards; shared by the whole module."""
+    with launcher(
+        "neuralmagic/gemma-2-2b-it-quantized.w4a16",
+        num_shard=2,
+        quantize="compressed-tensors",
+    ) as handle:
+        yield handle
+
+
+@pytest.fixture(scope="module")
+async def compressed_tensors_wna16(compressed_tensors_wna16_handle):
+    """Await the handle's `health(300)` check, then return its client."""
+    await compressed_tensors_wna16_handle.health(300)
+    return compressed_tensors_wna16_handle.client
+
+
+@pytest.mark.release
+@pytest.mark.asyncio
+@pytest.mark.private
+async def test_compressed_tensors_wna16(compressed_tensors_wna16, response_snapshot):
+    """Generate with default sampling and compare against the snapshot."""
+    response = await compressed_tensors_wna16.generate(
+        "What is deep learning?",
+        max_new_tokens=10,
+        decoder_input_details=True,
+    )
+
+    assert (
+        response.generated_text
+        == "\n\nDeep learning is a subset of machine learning that"
+    )
+    assert response.details.generated_tokens == 10
+    assert response == response_snapshot
+
+
+@pytest.mark.release
+@pytest.mark.asyncio
+@pytest.mark.private
+async def test_compressed_tensors_wna16_all_params(
+    compressed_tensors_wna16, response_snapshot
+):
+    """Generate with every sampling parameter set and a fixed seed."""
+    response = await compressed_tensors_wna16.generate(
+        "What is deep learning",
+        max_new_tokens=10,
+        repetition_penalty=1.2,
+        return_full_text=True,
+        stop_sequences=["test"],
+        temperature=0.5,
+        top_p=0.9,
+        top_k=10,
+        truncate=5,
+        typical_p=0.9,
+        watermark=True,
+        decoder_input_details=True,
+        seed=0,
+    )
+
+    assert response.details.generated_tokens == 10
+    assert (
+        response.generated_text
+        == "What is deep learning?\n\nDeep Learning is a subset of machine learning"
+    )
+    assert response == response_snapshot
+
+
+@pytest.mark.release
+@pytest.mark.asyncio
+@pytest.mark.private
+async def test_compressed_tensors_wna16_load(
+    compressed_tensors_wna16, generate_load, response_snapshot
+):
+    """Send four identical requests and require four identical generations."""
+    responses = await generate_load(
+        compressed_tensors_wna16,
+        "What is deep learning?",
+        max_new_tokens=10,
+        n=4,
+    )
+
+    assert (
+        responses[0].generated_text
+        == "\n\nDeep learning is a subset of machine learning that"
+    )
+    assert len(responses) == 4
+    assert all(r.generated_text == responses[0].generated_text for r in responses)
+
+    assert responses == response_snapshot
--- a/launcher/src/main.rs
+++ b/launcher/src/main.rs
@ -212,6 +212,8 @@ enum Quantization {
    ///   <https://hf.co/models?search=awq>.
    /// Should replace GPTQ models wherever possible because of the better latency
    Awq,
+    /// Compressed tensors, which can be a mixture of different quantization methods.
+    CompressedTensors,
    /// 8 bit quantization, doesn't require specific model.
    /// Should be a drop-in replacement to bitsandbytes with much better performance.
    /// Kernels are from <https://github.com/NetEase-FuXi/EETQ.git>
@ -274,6 +276,9 @@ impl std::fmt::Display for Quantization {
            Quantization::Awq => {
                write!(f, "awq")
            }
+            Quantization::CompressedTensors => {
+                write!(f, "compressed-tensors")
+            }
            Quantization::Eetq => {
                write!(f, "eetq")
            }
--- a/nix/server.nix
+++ b/nix/server.nix
@ -3,8 +3,10 @@
  buildPythonPackage,
  poetry-core,
  mypy-protobuf,
+  attention-kernels,
  awq-inference-engine,
  causal-conv1d,
+  compressed-tensors,
  eetq,
  einops,
  exllamav2,
@ -26,15 +28,18 @@
  opentelemetry-exporter-otlp,
  opentelemetry-instrumentation-grpc,
  opentelemetry-semantic-conventions,
+  outlines,
  peft,
+  prometheus-client,
  punica-kernels,
+  py-cpuinfo,
+  pydantic,
  safetensors,
  tokenizers,
  torch,
  sentencepiece,
  transformers,
  typer,
-  vllm,
 }:

 let
@ -71,9 +76,11 @@ buildPythonPackage {
  pythonRemoveDeps = [ "scipy" ];

  dependencies = [
+    attention-kernels
    awq-inference-engine
    eetq
    causal-conv1d
+    compressed-tensors
    einops
    exllamav2
    flashinfer
@ -93,14 +100,17 @@ buildPythonPackage {
    opentelemetry-exporter-otlp
    opentelemetry-instrumentation-grpc
    opentelemetry-semantic-conventions
+    outlines
    peft
+    prometheus-client
    punica-kernels
+    py-cpuinfo
+    pydantic
    safetensors
    sentencepiece
    tokenizers
    transformers
    typer
-    vllm
  ];

  prePatch = ''
--- a/router/src/infer/mod.rs
+++ b/router/src/infer/mod.rs
@ -10,10 +10,12 @@ use crate::{
 };
 use async_stream::stream;
 use async_trait::async_trait;
+use axum::response::sse::Event;
 use chat_template::ChatTemplate;
 use futures::future::try_join_all;
 use futures::Stream;
 use minijinja::ErrorKind;
+use serde::Serialize;
 use std::sync::atomic::{AtomicBool, Ordering};
 use std::sync::Arc;
 use thiserror::Error;
@ -373,4 +375,31 @ impl InferError {
            InferError::StreamSerializationError(_) => "stream_serialization_error",
        }
    }
+
+    /// Converts this error into an SSE `Event` carrying the JSON payload
+    /// `{"error": {"message": <Display text>, "http_status_code": 422}}`.
+    ///
+    /// Panics if `json_data` returns an error, since the result is unwrapped.
+    pub(crate) fn into_openai_event(self) -> Event {
+        let error = APIError {
+            message: self.to_string(),
+            http_status_code: 422,
+        };
+        Event::default()
+            .json_data(OpenaiErrorEvent { error })
+            .unwrap()
+    }
+}
+
+/// Error details serialized under the `error` key of an [`OpenaiErrorEvent`].
+#[derive(Serialize)]
+pub struct APIError {
+    message: String,
+    http_status_code: usize,
+}
+
+/// Top-level JSON payload that wraps an [`APIError`] in an `error` field.
+#[derive(Serialize)]
+pub struct OpenaiErrorEvent {
+    error: APIError,
 }
--- a/router/src/lib.rs
+++ b/router/src/lib.rs
@ -22,11 +22,13 @@ use tracing::warn;
 use utoipa::ToSchema;
 use validation::Validation;

+#[allow(clippy::large_enum_variant)]
 #[derive(Clone)]
 pub enum Tokenizer {
    Python {
        tokenizer_name: String,
        revision: Option<String>,
+        trust_remote_code: bool,
    },
    Rust(tokenizers::Tokenizer),
 }
@ -38,15 +40,20 @@ impl<'a> PyTokenizer<'a> {
        py: Python<'a>,
        tokenizer_name: String,
        revision: Option<String>,
+        trust_remote_code: bool,
    ) -> PyResult<PyTokenizer<'a>> {
        let transformers = py.import_bound("transformers")?;
        let auto = transformers.getattr("AutoTokenizer")?;
        let from_pretrained = auto.getattr("from_pretrained")?;
        let args = (tokenizer_name,);
        let kwargs = if let Some(rev) = &revision {
-            [("revision", rev.to_string())].into_py_dict_bound(py)
+            [
+                ("revision", rev.to_string().into_py(py)),
+                ("trust_remote_code", trust_remote_code.into_py(py)),
+            ]
+            .into_py_dict_bound(py)
        } else {
-            pyo3::types::PyDict::new_bound(py)
+            [("trust_remote_code", trust_remote_code.into_py(py))].into_py_dict_bound(py)
        };
        let tokenizer = from_pretrained.call(args, Some(&kwargs))?;
        tracing::info!("Loaded a python tokenizer");
--- a/router/src/server.rs
+++ b/router/src/server.rs
@ -109,7 +109,7 @@ request_body = CompatGenerateRequest,
 responses(
 (status = 200, description = "Generated Text",
 content(
-("application/json" = GenerateResponse),
+("application/json" = Vec<GenerateResponse>),
 ("text/event-stream" = StreamResponse),
 )),
 (status = 424, description = "Generation Error", body = ErrorResponse,
@ -866,7 +866,7 @@ pub(crate) async fn completions(

                                    yield Ok(event);
                                }
-                                Err(err) => yield Ok(Event::from(err)),
+                                Err(err) => yield Ok(err.into_openai_event()),
                            }
                        }
                    };
@ -1274,7 +1274,8 @@ pub(crate) async fn chat_completions(
            };
            let mut response_as_tool = using_tools;
            while let Some(result) = response_stream.next().await {
-                if let Ok(stream_token) = result {
+                match result{
+                Ok(stream_token) => {
                    let token_text = &stream_token.token.text.clone();
                    match state {
                        StreamState::Buffering => {
@ -1368,6 +1369,8 @@ pub(crate) async fn chat_completions(
                        }
                    }
                }
+                Err(err) => yield Ok(err.into_openai_event())
+                }
            }
            yield Ok::<Event, Infallible>(Event::default().data("[DONE]"));
        };
@ -1829,6 +1832,7 @@ pub async fn run(
            Tokenizer::Python {
                tokenizer_name: tokenizer_name.clone(),
                revision: revision.clone(),
+                trust_remote_code,
            }
        }
    };
--- a/router/src/validation.rs
+++ b/router/src/validation.rs
@ -439,9 +439,11 @@ fn tokenizer_worker(
        Tokenizer::Python {
            tokenizer_name,
            revision,
+            trust_remote_code,
        } => {
            pyo3::Python::with_gil(|py| -> pyo3::PyResult<()> {
-                let tokenizer = PyTokenizer::from_py(py, tokenizer_name, revision)?;
+                let tokenizer =
+                    PyTokenizer::from_py(py, tokenizer_name, revision, trust_remote_code)?;
                // Loop over requests
                while let Some(((inputs, add_special_tokens, truncate), response_tx, parent_span)) =
                    receiver.blocking_recv()
--- a/server/Makefile
+++ b/server/Makefile
@ -23,14 +23,14 @@ gen-server:
 install-server: gen-server
 	pip install pip --upgrade
 	pip install -r requirements_cuda.txt
-	pip install -e ".[accelerate, quantize, peft, outlines]"
+	pip install -e ".[accelerate, compressed-tensors, quantize, peft, outlines]"


 install: install-cuda
 	echo "Installed server"

-install-cuda: install-server install-flash-attention-v2-cuda install-vllm-cuda install-flash-attention
-	pip install -e ".[bnb,marlin,moe]"
+install-cuda: install-server install-flash-attention-v2-cuda install-flash-attention
+	pip install -e ".[attention,bnb,marlin,moe]"
 	pip install nvidia-nccl-cu12==2.22.3

 install-rocm: install-server install-flash-attention-v2-rocm  install-vllm-rocm
--- a/server/Makefile-vllm
+++ b/server/Makefile-vllm
@ -1,14 +1,4 @@
-commit_cuda := d243e9dc7e2c9c2e36a4150ec8e64809cb55c01b
 commit_rocm := 4e0929e6e4fa0a3d09d358715c288020ea9dc247
-build-vllm-cuda:
-	if [ ! -d 'vllm' ]; then \
-		pip install -U ninja packaging --no-cache-dir && \
-		git clone https://github.com/Narsil/vllm.git vllm; \
-	fi
-	cd vllm  && git fetch origin && git checkout $(commit_cuda) && python setup.py build
-
-install-vllm-cuda: build-vllm-cuda
-	cd vllm  && git fetch origin && git checkout $(commit_cuda) && pip install -e .

 build-vllm-rocm:
 	if [ ! -d 'vllm' ]; then \
--- a/server/poetry.lock
+++ b/server/poetry.lock
@ -1,4 +1,4 @@
-# This file is automatically @generated by Poetry 1.8.3 and should not be changed by hand.
+# This file is automatically @generated by Poetry 1.8.4 and should not be changed by hand.

 [[package]]
 name = "accelerate"
@ -167,6 +167,17 @@ files = [
 [package.dependencies]
 frozenlist = ">=1.1.0"

+[[package]]
+name = "airportsdata"
+version = "20241001"
+description = "Extensive database of location and timezone data for nearly every airport and landing strip in the world."
+optional = true
+python-versions = ">=3.9"
+files = [
+    {file = "airportsdata-20241001-py3-none-any.whl", hash = "sha256:67d71cf2c5378cc17ff66b62b1e11aa2444043949c894543ac8fd8dafce192fd"},
+    {file = "airportsdata-20241001.tar.gz", hash = "sha256:fa0bd143b4f4be3557cb892fa0612ef210fd91a92bd720b4d8221de576a4fa00"},
+]
+
 [[package]]
 name = "annotated-types"
 version = "0.7.0"
@ -189,6 +200,74 @@ files = [
    {file = "async_timeout-4.0.3-py3-none-any.whl", hash = "sha256:7405140ff1230c310e51dc27b3145b9092d659ce68ff733fb0cefe3ee42be028"},
 ]

+[[package]]
+name = "attention-kernels"
+version = "0.1.1"
+description = "Attention kernels"
+optional = true
+python-versions = ">=3.8"
+files = [
+    {file = "attention_kernels-0.1.1+cu123torch2.4-cp310-cp310-linux_x86_64.whl", hash = "sha256:812851d4ce0f54ca764ff3815a731b15f0cb110115d0aa2d0997cd7794d808bb"},
+]
+
+[package.dependencies]
+torch = "*"
+
+[package.source]
+type = "url"
+url = "https://github.com/danieldk/attention-kernels/releases/download/v0.1.1/attention_kernels-0.1.1+cu123torch2.4-cp310-cp310-linux_x86_64.whl"
+
+[[package]]
+name = "attention-kernels"
+version = "0.1.1"
+description = "Attention kernels"
+optional = true
+python-versions = ">=3.8"
+files = [
+    {file = "attention_kernels-0.1.1+cu123torch2.4-cp311-cp311-linux_x86_64.whl", hash = "sha256:614c402621b11dd1f5741a016b9fd27cb6a68814471f2048bc05206923516268"},
+]
+
+[package.dependencies]
+torch = "*"
+
+[package.source]
+type = "url"
+url = "https://github.com/danieldk/attention-kernels/releases/download/v0.1.1/attention_kernels-0.1.1+cu123torch2.4-cp311-cp311-linux_x86_64.whl"
+
+[[package]]
+name = "attention-kernels"
+version = "0.1.1"
+description = "Attention kernels"
+optional = true
+python-versions = ">=3.8"
+files = [
+    {file = "attention_kernels-0.1.1+cu123torch2.4-cp312-cp312-linux_x86_64.whl", hash = "sha256:6b2ca7c98997431d5f6c4af7553dce6b1bff8dfdec374c97c6ffba71325a02b7"},
+]
+
+[package.dependencies]
+torch = "*"
+
+[package.source]
+type = "url"
+url = "https://github.com/danieldk/attention-kernels/releases/download/v0.1.1/attention_kernels-0.1.1+cu123torch2.4-cp312-cp312-linux_x86_64.whl"
+
+[[package]]
+name = "attention-kernels"
+version = "0.1.1"
+description = "Attention kernels"
+optional = true
+python-versions = ">=3.8"
+files = [
+    {file = "attention_kernels-0.1.1+cu123torch2.4-cp39-cp39-linux_x86_64.whl", hash = "sha256:a56710c5626e461d6f628ae14b74ffc89833578ebd59c3c0c47f5d6f07461fbf"},
+]
+
+[package.dependencies]
+torch = "*"
+
+[package.source]
+type = "url"
+url = "https://github.com/danieldk/attention-kernels/releases/download/v0.1.1/attention_kernels-0.1.1+cu123torch2.4-cp39-cp39-linux_x86_64.whl"
+
 [[package]]
 name = "attrs"
 version = "24.2.0"
@ -388,6 +467,26 @@ files = [
    {file = "colorama-0.4.6.tar.gz", hash = "sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44"},
 ]

+[[package]]
+name = "compressed-tensors"
+version = "0.7.1"
+description = "Library for utilization of compressed safetensors of neural network models"
+optional = true
+python-versions = "*"
+files = [
+    {file = "compressed-tensors-0.7.1.tar.gz", hash = "sha256:3c7865ebfe4ea76ae94d7c674bcf93aedd2064571f682c09a377a219d5ebb3a0"},
+    {file = "compressed_tensors-0.7.1-py3-none-any.whl", hash = "sha256:22d11558a70f655ae647db9c8e9fb14a5e9d6983ca5aec3f267518625fd6dd0e"},
+]
+
+[package.dependencies]
+pydantic = ">=2.0"
+torch = ">=1.7.0"
+transformers = "*"
+
+[package.extras]
+accelerate = ["accelerate"]
+dev = ["black (==22.12.0)", "flake8 (>=3.8.3)", "isort (==5.8.0)", "nbconvert (>=7.16.3)", "pytest (>=6.0.0)", "wheel (>=0.36.2)"]
+
 [[package]]
 name = "datasets"
 version = "2.21.0"
@ -1023,17 +1122,6 @@ MarkupSafe = ">=2.0"
 [package.extras]
 i18n = ["Babel (>=2.7)"]

-[[package]]
-name = "joblib"
-version = "1.4.2"
-description = "Lightweight pipelining with Python functions"
-optional = true
-python-versions = ">=3.8"
-files = [
-    {file = "joblib-1.4.2-py3-none-any.whl", hash = "sha256:06d478d5674cbc267e7496a410ee875abd68e4340feff4490bcb7afb88060ae6"},
-    {file = "joblib-1.4.2.tar.gz", hash = "sha256:2382c5816b2636fbd20a09e0f4e9dad4736765fdfb7dca582943b9c1366b3f0e"},
-]
-
 [[package]]
 name = "jsonschema"
 version = "4.23.0"
@ -1086,36 +1174,6 @@ interegular = ["interegular (>=0.3.1,<0.4.0)"]
 nearley = ["js2py"]
 regex = ["regex"]

-[[package]]
-name = "llvmlite"
-version = "0.43.0"
-description = "lightweight wrapper around basic LLVM functionality"
-optional = true
-python-versions = ">=3.9"
-files = [
-    {file = "llvmlite-0.43.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:a289af9a1687c6cf463478f0fa8e8aa3b6fb813317b0d70bf1ed0759eab6f761"},
-    {file = "llvmlite-0.43.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:6d4fd101f571a31acb1559ae1af30f30b1dc4b3186669f92ad780e17c81e91bc"},
-    {file = "llvmlite-0.43.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7d434ec7e2ce3cc8f452d1cd9a28591745de022f931d67be688a737320dfcead"},
-    {file = "llvmlite-0.43.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6912a87782acdff6eb8bf01675ed01d60ca1f2551f8176a300a886f09e836a6a"},
-    {file = "llvmlite-0.43.0-cp310-cp310-win_amd64.whl", hash = "sha256:14f0e4bf2fd2d9a75a3534111e8ebeb08eda2f33e9bdd6dfa13282afacdde0ed"},
-    {file = "llvmlite-0.43.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:3e8d0618cb9bfe40ac38a9633f2493d4d4e9fcc2f438d39a4e854f39cc0f5f98"},
-    {file = "llvmlite-0.43.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:e0a9a1a39d4bf3517f2af9d23d479b4175ead205c592ceeb8b89af48a327ea57"},
-    {file = "llvmlite-0.43.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c1da416ab53e4f7f3bc8d4eeba36d801cc1894b9fbfbf2022b29b6bad34a7df2"},
-    {file = "llvmlite-0.43.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:977525a1e5f4059316b183fb4fd34fa858c9eade31f165427a3977c95e3ee749"},
-    {file = "llvmlite-0.43.0-cp311-cp311-win_amd64.whl", hash = "sha256:d5bd550001d26450bd90777736c69d68c487d17bf371438f975229b2b8241a91"},
-    {file = "llvmlite-0.43.0-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:f99b600aa7f65235a5a05d0b9a9f31150c390f31261f2a0ba678e26823ec38f7"},
-    {file = "llvmlite-0.43.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:35d80d61d0cda2d767f72de99450766250560399edc309da16937b93d3b676e7"},
-    {file = "llvmlite-0.43.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:eccce86bba940bae0d8d48ed925f21dbb813519169246e2ab292b5092aba121f"},
-    {file = "llvmlite-0.43.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:df6509e1507ca0760787a199d19439cc887bfd82226f5af746d6977bd9f66844"},
-    {file = "llvmlite-0.43.0-cp312-cp312-win_amd64.whl", hash = "sha256:7a2872ee80dcf6b5dbdc838763d26554c2a18aa833d31a2635bff16aafefb9c9"},
-    {file = "llvmlite-0.43.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:9cd2a7376f7b3367019b664c21f0c61766219faa3b03731113ead75107f3b66c"},
-    {file = "llvmlite-0.43.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:18e9953c748b105668487b7c81a3e97b046d8abf95c4ddc0cd3c94f4e4651ae8"},
-    {file = "llvmlite-0.43.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:74937acd22dc11b33946b67dca7680e6d103d6e90eeaaaf932603bec6fe7b03a"},
-    {file = "llvmlite-0.43.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bc9efc739cc6ed760f795806f67889923f7274276f0eb45092a1473e40d9b867"},
-    {file = "llvmlite-0.43.0-cp39-cp39-win_amd64.whl", hash = "sha256:47e147cdda9037f94b399bf03bfd8a6b6b1f2f90be94a454e3386f006455a9b4"},
-    {file = "llvmlite-0.43.0.tar.gz", hash = "sha256:ae2b5b5c3ef67354824fb75517c8db5fbe93bc02cd9671f3c62271626bc041d5"},
-]
-
 [[package]]
 name = "loguru"
 version = "0.6.0"
@ -1557,40 +1615,6 @@ doc = ["nb2plots (>=0.7)", "nbconvert (<7.9)", "numpydoc (>=1.6)", "pillow (>=9.
 extra = ["lxml (>=4.6)", "pydot (>=1.4.2)", "pygraphviz (>=1.11)", "sympy (>=1.10)"]
 test = ["pytest (>=7.2)", "pytest-cov (>=4.0)"]

-[[package]]
-name = "numba"
-version = "0.60.0"
-description = "compiling Python code using LLVM"
-optional = true
-python-versions = ">=3.9"
-files = [
-    {file = "numba-0.60.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:5d761de835cd38fb400d2c26bb103a2726f548dc30368853121d66201672e651"},
-    {file = "numba-0.60.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:159e618ef213fba758837f9837fb402bbe65326e60ba0633dbe6c7f274d42c1b"},
-    {file = "numba-0.60.0-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:1527dc578b95c7c4ff248792ec33d097ba6bef9eda466c948b68dfc995c25781"},
-    {file = "numba-0.60.0-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:fe0b28abb8d70f8160798f4de9d486143200f34458d34c4a214114e445d7124e"},
-    {file = "numba-0.60.0-cp310-cp310-win_amd64.whl", hash = "sha256:19407ced081d7e2e4b8d8c36aa57b7452e0283871c296e12d798852bc7d7f198"},
-    {file = "numba-0.60.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:a17b70fc9e380ee29c42717e8cc0bfaa5556c416d94f9aa96ba13acb41bdece8"},
-    {file = "numba-0.60.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:3fb02b344a2a80efa6f677aa5c40cd5dd452e1b35f8d1c2af0dfd9ada9978e4b"},
-    {file = "numba-0.60.0-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:5f4fde652ea604ea3c86508a3fb31556a6157b2c76c8b51b1d45eb40c8598703"},
-    {file = "numba-0.60.0-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:4142d7ac0210cc86432b818338a2bc368dc773a2f5cf1e32ff7c5b378bd63ee8"},
-    {file = "numba-0.60.0-cp311-cp311-win_amd64.whl", hash = "sha256:cac02c041e9b5bc8cf8f2034ff6f0dbafccd1ae9590dc146b3a02a45e53af4e2"},
-    {file = "numba-0.60.0-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:d7da4098db31182fc5ffe4bc42c6f24cd7d1cb8a14b59fd755bfee32e34b8404"},
-    {file = "numba-0.60.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:38d6ea4c1f56417076ecf8fc327c831ae793282e0ff51080c5094cb726507b1c"},
-    {file = "numba-0.60.0-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:62908d29fb6a3229c242e981ca27e32a6e606cc253fc9e8faeb0e48760de241e"},
-    {file = "numba-0.60.0-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:0ebaa91538e996f708f1ab30ef4d3ddc344b64b5227b67a57aa74f401bb68b9d"},
-    {file = "numba-0.60.0-cp312-cp312-win_amd64.whl", hash = "sha256:f75262e8fe7fa96db1dca93d53a194a38c46da28b112b8a4aca168f0df860347"},
-    {file = "numba-0.60.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:01ef4cd7d83abe087d644eaa3d95831b777aa21d441a23703d649e06b8e06b74"},
-    {file = "numba-0.60.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:819a3dfd4630d95fd574036f99e47212a1af41cbcb019bf8afac63ff56834449"},
-    {file = "numba-0.60.0-cp39-cp39-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:0b983bd6ad82fe868493012487f34eae8bf7dd94654951404114f23c3466d34b"},
-    {file = "numba-0.60.0-cp39-cp39-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:c151748cd269ddeab66334bd754817ffc0cabd9433acb0f551697e5151917d25"},
-    {file = "numba-0.60.0-cp39-cp39-win_amd64.whl", hash = "sha256:3031547a015710140e8c87226b4cfe927cac199835e5bf7d4fe5cb64e814e3ab"},
-    {file = "numba-0.60.0.tar.gz", hash = "sha256:5df6158e5584eece5fc83294b949fd30b9f1125df7708862205217e068aabf16"},
-]
-
-[package.dependencies]
-llvmlite = "==0.43.*"
-numpy = ">=1.22,<2.1"
-
 [[package]]
 name = "numpy"
 version = "1.26.4"
@ -1968,36 +1992,83 @@ opentelemetry-api = "1.25.0"

 [[package]]
 name = "outlines"
-version = "0.0.34"
+version = "0.1.3"
 description = "Probabilistic Generative Model Programming"
 optional = true
-python-versions = ">=3.8"
+python-versions = ">=3.9"
 files = [
-    {file = "outlines-0.0.34-py3-none-any.whl", hash = "sha256:911588a7e64a4f193b97fb4c501d98ccfd4e95a98f6a3ada67a280bf0c373c50"},
-    {file = "outlines-0.0.34.tar.gz", hash = "sha256:594e7204c770b47a62eb5c2ba7d25ea0ab2e16882b5f04556712a0228d3d3309"},
+    {file = "outlines-0.1.3-py3-none-any.whl", hash = "sha256:afcf6012b7cabbaae4a58975d03190c0bbc3d402b0b2a37538e05f335d73a247"},
+    {file = "outlines-0.1.3.tar.gz", hash = "sha256:5a48ad00d3bdd8eccaa7574821eb5aaa27ab9f61fde9c3fba52f352dc00197e4"},
 ]

 [package.dependencies]
+airportsdata = "*"
 cloudpickle = "*"
+datasets = "*"
 diskcache = "*"
 interegular = "*"
 jinja2 = "*"
-joblib = "*"
 jsonschema = "*"
 lark = "*"
 nest-asyncio = "*"
-numba = "*"
-numpy = "*"
+numpy = "<2.0.0"
+outlines-core = "0.1.14"
+pycountry = "*"
 pydantic = ">=2.0"
 referencing = "*"
 requests = "*"
-scipy = "*"
-torch = ">=2.1.0"
-transformers = "*"
+torch = "*"
+tqdm = "*"
+typing-extensions = "*"

 [package.extras]
-serve = ["fastapi", "pydantic (>=2.0)", "ray (==2.9.0)", "uvicorn", "vllm (>=0.3.0)"]
-test = ["accelerate", "beartype (<0.16.0)", "coverage[toml] (>=5.1)", "datasets", "diff-cover", "huggingface-hub", "llama-cpp-python (>=0.2.42)", "pre-commit", "pytest", "pytest-benchmark", "pytest-cov", "pytest-mock", "responses", "transformers"]
+serve = ["fastapi", "pydantic (>=2.0)", "uvicorn", "vllm (>=0.3.0)"]
+test = ["accelerate", "beartype (<0.16.0)", "coverage[toml] (>=5.1)", "diff-cover", "exllamav2", "huggingface-hub", "llama-cpp-python", "mlx-lm", "openai (>=1.0.0)", "pillow", "pre-commit", "pytest", "pytest-benchmark", "pytest-cov", "pytest-mock", "responses", "transformers", "vllm"]
+
+[[package]]
+name = "outlines-core"
+version = "0.1.14"
+description = "Structured Text Generation in Rust"
+optional = true
+python-versions = ">=3.8"
+files = [
+    {file = "outlines_core-0.1.14-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:291c6d9d348cb5562cd28ce44d80822d77238f1cd7c30d890b5b20488e71608d"},
+    {file = "outlines_core-0.1.14-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:3a50e2f6745e0c34cc857d1bd5590e2966ad06e8ce10802976e9e6c116c7533d"},
+    {file = "outlines_core-0.1.14-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f7dfe64b590a6a88dcc5e59f0a399fff0458cdcf97d68de07f08e1bd3bf8ac1d"},
+    {file = "outlines_core-0.1.14-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:100de068ce52893bec316481e65db8f1c734a0f25f540c29dafd7a8afec0a29d"},
+    {file = "outlines_core-0.1.14-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:e06cb724770fd0fe1c8444382c4a6e79901bba33720f70fe6c8437f58eceb92e"},
+    {file = "outlines_core-0.1.14-cp310-cp310-win32.whl", hash = "sha256:6d41da3d8a087fd54133cf910c2d5759da55490bbd0e3bc6c1e7907b54248415"},
+    {file = "outlines_core-0.1.14-cp310-cp310-win_amd64.whl", hash = "sha256:646fd1073feed393bc77f9605a2fa27a54551ab04f85867ce789af1dee6326fa"},
+    {file = "outlines_core-0.1.14-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:60f3a947fe09106f7668cf832c28b9269b8f0fc109f081608acfce9262213359"},
+    {file = "outlines_core-0.1.14-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:5e273a100c922f794d8e077a8161d0985d3005887066b4af3ae7afd3742fe9b8"},
+    {file = "outlines_core-0.1.14-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:622e547f11a869fc67be40abc4cbcda89ae6f46f9eb46a1ec0666bd6807e0c67"},
+    {file = "outlines_core-0.1.14-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:60c9933a9faaa51b39aea3518f1822b0d3ec2c9a13b16849caca3955e29e320d"},
+    {file = "outlines_core-0.1.14-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:4a8c616ce103ef9102dbf4326f67b03e1e0f46aa19351e57f4beb37588c00428"},
+    {file = "outlines_core-0.1.14-cp311-cp311-win32.whl", hash = "sha256:1c77aaa4556cbb6e93cc42be0a6e262f175e0754b7694d702d642ff03df67f2c"},
+    {file = "outlines_core-0.1.14-cp311-cp311-win_amd64.whl", hash = "sha256:eb6ffe410866f65dbe17e95b0aabd70d990f058a2dc4e8b74f9583b07248cd36"},
+    {file = "outlines_core-0.1.14-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:b0e408b033618f23e9bb928a47b33b1bd4c9d04a3dbec680a20977de3b4f590d"},
+    {file = "outlines_core-0.1.14-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:21d1393a6da5d3320e8c8247e9deeb851c5c862fd6ea5c779bd29797e8987155"},
+    {file = "outlines_core-0.1.14-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5829c568db76673d36caaf0f86e96748b491b4a209deb9be87617372394a5fb9"},
+    {file = "outlines_core-0.1.14-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7e855ec99bce1099c0755bcbfa44568adf7ae0083905ba04f58a17614ddf0fe7"},
+    {file = "outlines_core-0.1.14-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:b897cfbf9c2719aa011d9b439b4c6751d9c7df5683b2169617972d4b4a914403"},
+    {file = "outlines_core-0.1.14-cp38-cp38-win32.whl", hash = "sha256:4c9d908004b31bcd432156d60f4895bf5e1b51ca8c8eed82b12f1bb57d5bf7fd"},
+    {file = "outlines_core-0.1.14-cp38-cp38-win_amd64.whl", hash = "sha256:6668a930d928216d0b319ad84947903f1e27556f604a9743051f795b11008b64"},
+    {file = "outlines_core-0.1.14-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:b745aa469cf3fb347b79a257804d75d1324e01691158664c1e413a816ce6b98d"},
+    {file = "outlines_core-0.1.14-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:27504c8360467429d6223ebc49180d6956d7418bfc3d324f6ad10f069e1813ad"},
+    {file = "outlines_core-0.1.14-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fd8f1e1d91a206a520d1c577ce00136de2beb1d200ef93759fd4c9f45abe24d3"},
+    {file = "outlines_core-0.1.14-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f30c8acb42895b624c504b85678331c5f9376fa4b8069ce06a27cf80f5881e27"},
+    {file = "outlines_core-0.1.14-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:0e6cd0e7d995a7b04d90139a695279ab4a9eb7f492618b2c037a85eaf5f9fc59"},
+    {file = "outlines_core-0.1.14-cp39-cp39-win32.whl", hash = "sha256:3104af4084da0e7c3d4b8538b43c725581d66bb68d426bc389680f06c3667476"},
+    {file = "outlines_core-0.1.14-cp39-cp39-win_amd64.whl", hash = "sha256:45c6b9baded0337c4dcfa156af05ec4efd2b25c4d976e77be28146e4037b991f"},
+    {file = "outlines_core-0.1.14.tar.gz", hash = "sha256:6db033e4f8e48381164e36cc716746640ad5022f0d86e4c88af15c75886b93a4"},
+]
+
+[package.dependencies]
+interegular = "*"
+jsonschema = "*"
+
+[package.extras]
+test = ["accelerate", "asv", "beartype (<0.16.0)", "coverage[toml] (>=5.1)", "datasets", "diff-cover", "huggingface-hub", "pillow", "pre-commit", "pydantic", "pytest", "pytest-benchmark", "pytest-cov", "pytest-mock", "setuptools-rust", "torch", "transformers"]

 [[package]]
 name = "packaging"
@ -2470,6 +2541,17 @@ numpy = ">=1.16.6"
 [package.extras]
 test = ["cffi", "hypothesis", "pandas", "pytest", "pytz"]

+[[package]]
+name = "pycountry"
+version = "24.6.1"
+description = "ISO country, subdivision, language, currency and script definitions and their translations"
+optional = true
+python-versions = ">=3.8"
+files = [
+    {file = "pycountry-24.6.1-py3-none-any.whl", hash = "sha256:f1a4fb391cd7214f8eefd39556d740adcc233c778a27f8942c8dca351d6ce06f"},
+    {file = "pycountry-24.6.1.tar.gz", hash = "sha256:b61b3faccea67f87d10c1f2b0fc0be714409e8fcdcc1315613174f6466c10221"},
+]
+
 [[package]]
 name = "pydantic"
 version = "2.9.2"
@ -3971,7 +4053,9 @@ type = ["pytest-mypy"]

 [extras]
 accelerate = ["accelerate"]
+attention = ["attention-kernels", "attention-kernels", "attention-kernels", "attention-kernels"]
 bnb = ["bitsandbytes"]
+compressed-tensors = ["compressed-tensors"]
 marlin = ["marlin-kernels", "marlin-kernels", "marlin-kernels", "marlin-kernels"]
 moe = ["moe-kernels", "moe-kernels", "moe-kernels", "moe-kernels"]
 outlines = ["outlines"]
@ -3982,4 +4066,4 @@ torch = ["torch"]
 [metadata]
 lock-version = "2.0"
 python-versions = ">=3.9,<3.13"
-content-hash = "b39033e573f50a0f046787aebf1702d86673aad0b2fcee818404fcea7f644b81"
+content-hash = "05add88628d836faceae1a26fde4092651a6eca74555ae38ebff879a7895be7e"
--- a/server/pyproject.toml
+++ b/server/pyproject.toml
@ -9,7 +9,7 @@ text-generation-server = 'text_generation_server.cli:app'

 [tool.poetry.dependencies]
 python = ">=3.9,<3.13"
-protobuf = "^4.25.3"
+protobuf = ">=4.25.3,<6"
 grpcio = "^1.51.1"
 grpcio-status = "^1.51.1"
 grpcio-reflection = "^1.51.1"
@ -34,12 +34,19 @@ peft = { version = "^0.10", optional = true }
 torch = { version = "^2.4.0", optional = true }
 scipy = "^1.11.1"
 pillow = "^10.0.0"
-outlines= { version = "^0.0.34", optional = true }
-prometheus-client = "^0.20.0"
+outlines= { version = "^0.1.1", optional = true }
+prometheus-client = ">=0.20.0,<0.22"
 py-cpuinfo = "^9.0.0"
+compressed-tensors = { version = "^0.7.1", optional = true }
 # Remove later, temporary workaround for outlines.
 numpy = "^1.26"

+attention-kernels = [
+  { url = "https://github.com/danieldk/attention-kernels/releases/download/v0.1.1/attention_kernels-0.1.1+cu123torch2.4-cp39-cp39-linux_x86_64.whl", python = "~3.9", optional = true },
+  { url = "https://github.com/danieldk/attention-kernels/releases/download/v0.1.1/attention_kernels-0.1.1+cu123torch2.4-cp310-cp310-linux_x86_64.whl", python = "~3.10", optional = true },
+  { url = "https://github.com/danieldk/attention-kernels/releases/download/v0.1.1/attention_kernels-0.1.1+cu123torch2.4-cp311-cp311-linux_x86_64.whl", python = "~3.11", optional = true },
+  { url = "https://github.com/danieldk/attention-kernels/releases/download/v0.1.1/attention_kernels-0.1.1+cu123torch2.4-cp312-cp312-linux_x86_64.whl", python = "~3.12", optional = true },
+]
 marlin-kernels = [
  { url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.1/marlin_kernels-0.3.1+cu123torch2.4-cp39-cp39-linux_x86_64.whl", python = "~3.9", optional = true },
  { url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.1/marlin_kernels-0.3.1+cu123torch2.4-cp310-cp310-linux_x86_64.whl", python = "~3.10", optional = true },
@ -57,7 +64,9 @@ rich = "^13.7.1"
 [tool.poetry.extras]
 torch = ["torch"]
 accelerate = ["accelerate"]
+attention = ["attention-kernels"]
 bnb = ["bitsandbytes"]
+compressed-tensors = ["compressed-tensors"]
 marlin = ["marlin-kernels"]
 moe = ["moe-kernels"]
 peft = ["peft"]
--- a/server/requirements_cuda.txt
+++ b/server/requirements_cuda.txt
@ -45,7 +45,7 @@ sentencepiece==0.2.0 ; python_version >= "3.9" and python_version < "3.13"
 setuptools==75.2.0 ; python_version >= "3.9" and python_version < "3.13"
 tokenizers==0.20.1 ; python_version >= "3.9" and python_version < "3.13"
 tqdm==4.66.5 ; python_version >= "3.9" and python_version < "3.13"
-transformers==4.45.2 ; python_version >= "3.9" and python_version < "3.13"
+transformers==4.46.0 ; python_version >= "3.9" and python_version < "3.13"
 typer==0.6.1 ; python_version >= "3.9" and python_version < "3.13"
 typing-extensions==4.12.2 ; python_version >= "3.9" and python_version < "3.13"
 urllib3==2.2.3 ; python_version >= "3.9" and python_version < "3.13"
--- a/server/requirements_intel.txt
+++ b/server/requirements_intel.txt
@ -45,7 +45,7 @@ sentencepiece==0.2.0 ; python_version >= "3.9" and python_version < "3.13"
 setuptools==75.2.0 ; python_version >= "3.9" and python_version < "3.13"
 tokenizers==0.20.1 ; python_version >= "3.9" and python_version < "3.13"
 tqdm==4.66.5 ; python_version >= "3.9" and python_version < "3.13"
-transformers==4.45.2 ; python_version >= "3.9" and python_version < "3.13"
+transformers==4.46.0 ; python_version >= "3.9" and python_version < "3.13"
 typer==0.6.1 ; python_version >= "3.9" and python_version < "3.13"
 typing-extensions==4.12.2 ; python_version >= "3.9" and python_version < "3.13"
 urllib3==2.2.3 ; python_version >= "3.9" and python_version < "3.13"
--- a/server/requirements_rocm.txt
+++ b/server/requirements_rocm.txt
@ -45,7 +45,7 @@ sentencepiece==0.2.0 ; python_version >= "3.9" and python_version < "3.13"
 setuptools==75.2.0 ; python_version >= "3.9" and python_version < "3.13"
 tokenizers==0.20.1 ; python_version >= "3.9" and python_version < "3.13"
 tqdm==4.66.5 ; python_version >= "3.9" and python_version < "3.13"
-transformers==4.45.2 ; python_version >= "3.9" and python_version < "3.13"
+transformers==4.46.0 ; python_version >= "3.9" and python_version < "3.13"
 typer==0.6.1 ; python_version >= "3.9" and python_version < "3.13"
 typing-extensions==4.12.2 ; python_version >= "3.9" and python_version < "3.13"
 urllib3==2.2.3 ; python_version >= "3.9" and python_version < "3.13"
--- a/server/text_generation_server/cli.py
+++ b/server/text_generation_server/cli.py
@ -19,6 +19,7 @@ class Quantization(str, Enum):
    bitsandbytes_fp4 = "bitsandbytes-fp4"
    gptq = "gptq"
    awq = "awq"
+    compressed_tensors = "compressed-tensors"
    eetq = "eetq"
    exl2 = "exl2"
    fp8 = "fp8"
--- a/server/text_generation_server/layers/attention/cuda.py
+++ b/server/text_generation_server/layers/attention/cuda.py
@ -108,7 +108,7 @@ def paged_attention(
        if softcap is not None:
            raise RuntimeError("Paged attention doesn't support softcapping")
        input_lengths = seqlen.input_lengths + seqlen.cache_lengths
-        from vllm._C import ops
+        import attention_kernels

        out = torch.empty_like(query)

@ -116,7 +116,7 @@ def paged_attention(
            max_num_partitions == 1 or num_seqs * num_heads > 512
        )
        if use_v1:
-            ops.paged_attention_v1(
+            attention_kernels.paged_attention_v1(
                out,
                query,
                kv_cache.key,
@ -146,7 +146,7 @@ def paged_attention(
            )
            max_logits = torch.empty_like(exp_sums)

-            ops.paged_attention_v2(
+            attention_kernels.paged_attention_v2(
                out,
                exp_sums,
                max_logits,
--- a/server/text_generation_server/layers/attention/kv_cache.py
+++ b/server/text_generation_server/layers/attention/kv_cache.py
@ -200,12 +200,12 @@ def paged_reshape_and_cache(
 ):
    if SYSTEM == "cuda":
        try:
-            from vllm._C import cache_ops
+            import attention_kernels
        except Exception as e:
            raise ImportError(
-                f"Could not import vllm paged attention. Make sure your installation is correct. Complete error: {e}"
+                f"Could not import attention_kernels. Make sure your installation is correct. Complete error: {e}"
            )
-        cache_ops.reshape_and_cache(
+        attention_kernels.reshape_and_cache(
            key, value, key_cache, value_cache, slots, "auto", 1.0
        )
    elif SYSTEM == "rocm":
--- a/server/text_generation_server/layers/compressed_tensors/init.py
+++ b/server/text_generation_server/layers/compressed_tensors/init.py
@ -0,0 +1,3 @@
+from .loader import CompressedTensorsLoader
+
+__all__ = ["CompressedTensorsLoader"]
--- a/server/text_generation_server/layers/compressed_tensors/loader.py
+++ b/server/text_generation_server/layers/compressed_tensors/loader.py
@ -0,0 +1,174 @@
+from typing import Any, Dict, List, Union
+
+from compressed_tensors import QuantizationConfig, QuantizationStatus
+from compressed_tensors.config import CompressionFormat
+from compressed_tensors.quantization import (
+    QuantizationScheme,
+    QuantizationType,
+    find_name_or_class_matches,
+)
+from loguru import logger
+from pydantic import ValidationError
+from torch import nn
+
+from text_generation_server.layers.compressed_tensors.w8an_fp import W8ANFpLoader
+from text_generation_server.layers.compressed_tensors.wna16_int import WNA16Loader
+from text_generation_server.utils.log import log_once
+from text_generation_server.utils.weights import (
+    DefaultWeightsLoader,
+    UnquantizedWeight,
+    Weights,
+    WeightsLoader,
+)
+
+# compressed-tensors can match modules as quantization targets. However,
+# they need to be objects rather than classes or class names. Since we
+# only need to match `Linear` targets, create a single zero-sized instance
+# once at import time and re-use it for every lookup.
+_EMPTY_LINEAR: nn.Module = nn.Linear(0, 0)
+
+
+class CompressedTensorsLoader(WeightsLoader):
+    """
+    Loader for checkpoints stored in the compressed-tensors format.
+
+    The quantization configuration of the checkpoint is parsed once at
+    construction time. Every quantization target named in the configuration
+    gets its own weights loader and each `get_*` call is forwarded to the
+    loader of the target that matches the requested prefix. Prefixes that
+    match the configuration's ignore list are loaded unquantized.
+    """
+
+    def __init__(self, config: Dict[str, Any]):
+        """
+        Parse and validate the compressed-tensors configuration.
+
+        Args:
+            config: Model configuration dictionary. The compressed-tensors
+                settings are read from `quantization_config`, falling back
+                to the legacy `compression_config` key.
+
+        Raises:
+            ValueError: if the configuration is missing, cannot be parsed,
+                reports a quantization status other than compressed or
+                frozen, contains an unsupported group, or configures the
+                same target more than once.
+        """
+        quantization_config_raw = config.get("quantization_config")
+        if quantization_config_raw is None:
+            # `compression_config` was renamed to `quantization_config`; support
+            # retained for backward compatibility.
+            quantization_config_raw = config.get("compression_config")
+        if quantization_config_raw is None:
+            raise ValueError(
+                "Checkpoint does not have compressed-tensors configuration"
+            )
+
+        try:
+            quantization_config = QuantizationConfig.model_validate(
+                quantization_config_raw
+            )
+        except ValidationError as e:
+            raise ValueError("Cannot parse compressed-tensors configuration") from e
+
+        if quantization_config.quantization_status not in (
+            QuantizationStatus.COMPRESSED,
+            QuantizationStatus.FROZEN,
+        ):
+            raise ValueError(
+                f"Model quantization was not finished, status was: {quantization_config.quantization_status}"
+            )
+
+        # Normalize a missing ignore list to an empty one so that lookups
+        # do not have to special-case `None`.
+        self.ignore = (
+            quantization_config.ignore if quantization_config.ignore is not None else []
+        )
+        self.loaders = self._get_target_loaders(quantization_config)
+
+        for target, loader in self.loaders.items():
+            log_once(
+                logger.info,
+                f"Using {loader} for compressed-tensors target '{target}'",
+            )
+
+    def get_weights(self, weights: Weights, prefix: str):
+        """Load the weights for `prefix` with the loader matching that prefix."""
+        loader = self._lookup_loader(prefix)
+        return loader.get_weights(weights, prefix)
+
+    def get_weights_col_packed(
+        self,
+        weights: "Weights",
+        prefix: str,
+        block_sizes: Union[int, List[int]],
+    ):
+        """Load packed column weights for `prefix` with the matching loader."""
+        loader = self._lookup_loader(prefix)
+        return loader.get_weights_col_packed(weights, prefix, block_sizes)
+
+    def get_multi_weights_col(self, weights: Weights, prefixes: List[str], dim: int):
+        """
+        Load and combine the column weights of several prefixes.
+
+        The loader is selected from the first prefix only.
+        """
+        # NOTE(review): the remaining prefixes are not checked against the
+        # configuration; this presumably relies on all of them belonging to
+        # the same quantization target - confirm.
+        loader = self._lookup_loader(prefixes[0])
+        return loader.get_multi_weights_col(weights, prefixes, dim)
+
+    def get_weights_row(self, weights: Weights, prefix: str):
+        """Load row weights for `prefix` with the loader matching that prefix."""
+        loader = self._lookup_loader(prefix)
+        return loader.get_weights_row(weights, prefix)
+
+    def _get_target_loaders(
+        self, quantization_config: QuantizationConfig
+    ) -> Dict[str, WeightsLoader]:
+        """
+        A compressed-tensors checkpoint can use different quantizations
+        for different targets. This method returns a dictionary with a
+        loader per target.
+
+        Raises:
+            ValueError: if a group is not a parsed quantization scheme, has
+                an unsupported configuration, or if a target is configured
+                by more than one group.
+        """
+
+        loaders: Dict[str, WeightsLoader] = {}
+
+        compression_format = quantization_config.format
+
+        for group_name, group in quantization_config.config_groups.items():
+            # The group configuration can be a string, but does that ever
+            # happen in a serialized quantization config? Reject it with an
+            # explicit error rather than an `assert`, which is stripped when
+            # Python runs with `-O`.
+            if not isinstance(group, QuantizationScheme):
+                raise ValueError(
+                    f"Group '{group_name}' is not a quantization scheme: {group!r}"
+                )
+
+            loader = self._create_loader_for_group(
+                compression_format, group_name, group
+            )
+
+            # A quantized parameter group can have multiple targets, add the
+            # loader for all the targets.
+            for target in group.targets:
+                if target in loaders:
+                    raise ValueError(
+                        f"Target '{target}' has multiple configured loaders"
+                    )
+                loaders[target] = loader
+
+        return loaders
+
+    def _create_loader_for_group(
+        self, format: str, group_name: str, group: QuantizationScheme
+    ) -> WeightsLoader:
+        """
+        Find and create a loader for the group with the given quantization
+        scheme.
+
+        Args:
+            format: Compression format of the checkpoint.
+            group_name: Name of the group, only used in error messages.
+            group: Quantization scheme of the group.
+
+        Raises:
+            ValueError: if no loader supports the combination of format
+                and weight quantization.
+        """
+        # NOTE: we ignore group.output_activations because we don't support
+        #       output quantization yet.
+
+        input_activations = group.input_activations
+        weights = group.weights
+        if (
+            format
+            in {
+                CompressionFormat.float_quantized.value,
+                CompressionFormat.naive_quantized.value,
+            }
+            and weights is not None
+            and weights.type == QuantizationType.FLOAT
+            and weights.num_bits == 8
+        ):
+            # FP W8A8 or W8A16.
+            return W8ANFpLoader(input_activations=input_activations, weights=weights)
+        elif (
+            format == CompressionFormat.pack_quantized.value
+            and weights is not None
+            and weights.type == QuantizationType.INT
+            and weights.num_bits in (4, 8)
+        ):
+            # INT W4A16 or W8A16 (GPTQ/AWQ-like).
+            return WNA16Loader(weights)
+        else:
+            raise ValueError(
+                f"Group '{group_name}' has unsupported compressed-tensors configuration"
+            )
+
+    def _lookup_loader(self, prefix: str) -> WeightsLoader:
+        """
+        Look up the loader to use for a given parameter name (prefix).
+
+        Raises:
+            ValueError: if the prefix is neither ignored nor matched by any
+                configured target.
+        """
+
+        # Ignored parameters take precedence over targets and are loaded
+        # without quantization.
+        if len(find_name_or_class_matches(prefix, _EMPTY_LINEAR, self.ignore)) > 0:
+            return DefaultWeightsLoader(UnquantizedWeight)
+
+        # We currently only handle linear layers, so unconditionally pass
+        # a `Linear` instance.
+        targets = find_name_or_class_matches(prefix, _EMPTY_LINEAR, self.loaders.keys())
+        if len(targets) == 0:
+            raise ValueError(
+                f"Cannot find compressed-tensors target for prefix: {prefix}"
+            )
+        # When several targets match, the first reported match is used.
+        return self.loaders[targets[0]]
--- a/server/text_generation_server/layers/compressed_tensors/w8an_fp.py
+++ b/server/text_generation_server/layers/compressed_tensors/w8an_fp.py
@ -0,0 +1,174 @@
+from typing import List, Optional, Union
+
+import torch
+from compressed_tensors.quantization import QuantizationArgs, QuantizationType
+
+from text_generation_server.layers.fp8 import Fp8Weight, _load_scalar_or_matrix_scale
+from text_generation_server.utils.weights import Weights, WeightsLoader
+
+
+class W8ANFpLoader(WeightsLoader):
+    """
+    Weights loader for compressed-tensors checkpoints with 8-bit float
+    weights (W8A8 or W8A16).
+    """
+
+    def __init__(
+        self,
+        *,
+        input_activations: Optional[QuantizationArgs],
+        weights: QuantizationArgs,
+    ):
+        assert weights.type == QuantizationType.FLOAT and weights.num_bits == 8
+
+        # The `strategy` option (per-tensor, per-channel or per-token scales)
+        # is intentionally not consulted. Which scales can be used depends
+        # on the kernels (e.g. cutlass can do tokenwise, Torch cannot, and
+        # FP8-Marlin does not quantize inputs at all), so we try to use the
+        # best-possible configuration instead.
+
+        quantizes_inputs = input_activations is not None
+
+        # Scales are only read from the checkpoint when they are not dynamic.
+        self.load_weight_scale = not weights.dynamic
+        self.load_input_scale = quantizes_inputs and not input_activations.dynamic
+        self.force_w8a16 = quantizes_inputs and input_activations.num_bits == 16
+
+    def __str__(self) -> str:
+        activation_bits = 16 if self.force_w8a16 else 8
+        weight_mode = "static" if self.load_weight_scale else "dynamic"
+        input_mode = "static" if self.load_input_scale else "dynamic"
+
+        return f"{self.__class__.__name__} (W8A{activation_bits}, weight: {weight_mode}, input: {input_mode})"
+
+    def _unsharded_scales(self, weights: "Weights", prefix: str, w):
+        """
+        Read the weight and input scales of `prefix` as whole tensors.
+
+        Returns a `(weight_scale, input_scale)` pair; an entry is `None`
+        when the corresponding scale is not loaded from the checkpoint.
+        """
+        weight_scale = None
+        if self.load_weight_scale:
+            stored = weights.get_tensor(f"{prefix}.weight_scale", to_dtype=False)
+            # Flatten and expand to one entry per row of the weight.
+            weight_scale = stored.reshape(-1).expand(w.shape[0])
+
+        input_scale = None
+        if self.load_input_scale:
+            stored = weights.get_tensor(f"{prefix}.input_scale", to_dtype=False)
+            input_scale = stored.reshape(-1)
+
+        return weight_scale, input_scale
+
+    def _fp8_weight(self, weights: "Weights", w, weight_scale, input_scale):
+        """Bundle the loaded tensors into an `Fp8Weight`."""
+        return Fp8Weight(
+            weight=w,
+            weight_scale=weight_scale,
+            input_scale=input_scale,
+            dtype=weights.dtype,
+            force_w8a16=self.force_w8a16,
+        )
+
+    def get_weights(self, weights: "Weights", prefix: str):
+        tensor = weights.get_tensor(f"{prefix}.weight")
+        weight_scale, input_scale = self._unsharded_scales(weights, prefix, tensor)
+        return self._fp8_weight(weights, tensor, weight_scale, input_scale)
+
+    def get_weights_col_packed(
+        self,
+        weights: Weights,
+        prefix: str,
+        block_sizes: Union[int, List[int]],
+    ):
+        packed = weights.get_packed_sharded(
+            f"{prefix}.weight", dim=0, block_sizes=block_sizes
+        )
+
+        weight_scale = None
+        if self.load_weight_scale:
+            scale_name = f"{prefix}.weight_scale"
+            weight_scale = weights.get_tensor(scale_name, to_dtype=False)
+            # A scale with more than one element is sharded like the weight.
+            if weight_scale.numel() > 1:
+                weight_scale = weights.get_packed_sharded(
+                    scale_name,
+                    dim=0,
+                    block_sizes=block_sizes,
+                    to_dtype=False,
+                )
+            weight_scale = weight_scale.reshape(-1).expand(packed.shape[0])
+
+        input_scale = None
+        if self.load_input_scale:
+            scale_name = f"{prefix}.input_scale"
+            input_scale = weights.get_tensor(scale_name, to_dtype=False)
+            if input_scale.numel() > 1:
+                input_scale = weights.get_packed_sharded(
+                    scale_name,
+                    dim=0,
+                    block_sizes=block_sizes,
+                    to_dtype=False,
+                )
+            # Reduce to a single scale by taking the maximum.
+            input_scale = input_scale.reshape(-1).max()
+
+        return self._fp8_weight(weights, packed, weight_scale, input_scale)
+
+    def get_multi_weights_col(self, weights: "Weights", prefixes: List[str], dim: int):
+        # FIXME: Force to_device to false as fp8 weights do not support torch.cat on device yet
+        shards = [
+            weights.get_sharded(f"{p}.weight", dim=0, to_device=False) for p in prefixes
+        ]
+        shard_shapes = [shard.shape for shard in shards]
+
+        # Concat then send to the device
+        merged = torch.cat(shards, dim=dim).to(weights.device)
+
+        weight_scale = None
+        if self.load_weight_scale:
+            weight_scales = [
+                _load_scalar_or_matrix_scale(weights, f"{p}.weight_scale", shape)
+                for p, shape in zip(prefixes, shard_shapes)
+            ]
+            weight_scale = torch.cat(weight_scales, dim=0).reshape(-1)
+
+        input_scale = None
+        if self.load_input_scale:
+            input_scales = [
+                _load_scalar_or_matrix_scale(weights, f"{p}.input_scale", shape)
+                for p, shape in zip(prefixes, shard_shapes)
+                if weights.has_tensor(f"{p}.input_scale")
+            ]
+            # Either every prefix has an input scale or none of them does.
+            assert len(input_scales) in (0, len(prefixes))
+            if input_scales:
+                input_scale = torch.cat(input_scales, dim=0).reshape(-1).max()
+
+        return self._fp8_weight(weights, merged, weight_scale, input_scale)
+
+    def get_weights_row(self, weights: "Weights", prefix: str):
+        shard = weights.get_sharded(f"{prefix}.weight", dim=1)
+        weight_scale, input_scale = self._unsharded_scales(weights, prefix, shard)
+        return self._fp8_weight(weights, shard, weight_scale, input_scale)
--- a/server/text_generation_server/layers/compressed_tensors/wna16_int.py
+++ b/server/text_generation_server/layers/compressed_tensors/wna16_int.py
@ -0,0 +1,188 @@
+from typing import List, Union
+
+import torch
+from compressed_tensors.quantization import ActivationOrdering, QuantizationArgs
+from loguru import logger
+
+from text_generation_server.layers.marlin.gptq import repack_gptq_for_marlin
+from text_generation_server.utils.log import log_once
+from text_generation_server.utils.weights import Weights, WeightsLoader
+
+
+class WNA16Loader(WeightsLoader):
+    """
+    Loader for W4A16/W8A16 INT compressed-tensors parameters.
+
+    Every `get_*` method reads the packed weights together with their
+    scales (and, when configured, zero points and group indices) and hands
+    them to `repack_gptq_for_marlin` with
+    `quant_method="compressed-tensors"`.
+    """
+
+    def __init__(self, weights: QuantizationArgs):
+        """
+        Args:
+            weights: Quantization arguments of the weights of the group
+                that this loader serves.
+        """
+        self.weights = weights
+        # Activation reordering is only enabled for `GROUP` ordering.
+        self.desc_act = self.weights.actorder == ActivationOrdering.GROUP
+        # A missing group size is passed on to the repacking helper as -1.
+        self.groupsize = (
+            -1 if self.weights.group_size is None else self.weights.group_size
+        )
+
+    def __str__(self) -> str:
+        # Renders e.g. "WNA16Loader (W4A16)".
+        quantization_type = f"W{self.weights.num_bits}A16"
+
+        return f"{self.__class__.__name__} ({quantization_type})"
+
+    def get_weights(self, weights: Weights, prefix: str):
+        """
+        Load the unsharded quantized weight stored under `prefix`.
+
+        Raises:
+            RuntimeError: if the packed weight cannot be loaded.
+        """
+        log_once(logger.info, "Using GPTQ-Marlin kernels")
+        try:
+            weight_packed = weights.get_tensor(f"{prefix}.weight_packed").t()
+        except RuntimeError as e:
+            raise RuntimeError(
+                f"Cannot load w{self.weights.num_bits}a16 weight, make sure the model is already quantized"
+            ) from e
+
+        zero_point = None
+        if not self.weights.symmetric:
+            zero_point = weights.get_tensor(f"{prefix}.weight_zero_point").t()
+
+        g_idx = None
+        if self.desc_act:
+            g_idx = weights.get_tensor(f"{prefix}.weight_g_idx")
+
+        # The scales use the same `weight_scale` tensor name as in the
+        # other loading methods.
+        scales = weights.get_tensor(f"{prefix}.weight_scale").t()
+
+        return repack_gptq_for_marlin(
+            qweight=weight_packed.contiguous(),
+            scales=scales,
+            qzeros=zero_point,
+            g_idx=g_idx,
+            bits=self.weights.num_bits,
+            desc_act=self.desc_act,
+            groupsize=self.groupsize,
+            quant_method="compressed-tensors",
+            sym=self.weights.symmetric,
+            sharded_infeatures=False,
+        )
+
+    def get_weights_col_packed(
+        self,
+        weights: Weights,
+        prefix: str,
+        block_sizes: Union[int, List[int]],
+    ):
+        """
+        Load a column-sharded weight in which several blocks are packed
+        into a single tensor.
+
+        Raises:
+            RuntimeError: if the packed weight cannot be loaded.
+        """
+        try:
+            weight_packed = weights.get_packed_sharded(
+                f"{prefix}.weight_packed", dim=0, block_sizes=block_sizes
+            ).t()
+        except RuntimeError as e:
+            raise RuntimeError(
+                f"Cannot load w{self.weights.num_bits}a16 weight, make sure the model is already quantized"
+            ) from e
+        scales = weights.get_packed_sharded(
+            f"{prefix}.weight_scale", dim=0, block_sizes=block_sizes
+        ).t()
+        scales = scales.to(dtype=weights.dtype)
+
+        zero_point = None
+        if not self.weights.symmetric:
+            # NOTE(review): `get_weights` and `get_weights_row` read
+            # `weight_zero_point`; confirm that `qzeros` is the intended
+            # tensor name here.
+            zero_point = weights.get_packed_sharded(
+                f"{prefix}.qzeros", dim=0, block_sizes=block_sizes
+            ).t()
+
+        g_idx = None
+        if self.desc_act:
+            # NOTE(review): `get_weights` reads `weight_g_idx`; confirm
+            # that `g_idx` is the intended tensor name here.
+            g_idx = weights.get_tensor(f"{prefix}.g_idx")
+
+        return repack_gptq_for_marlin(
+            qweight=weight_packed.contiguous(),
+            scales=scales,
+            qzeros=zero_point,
+            g_idx=g_idx,
+            bits=self.weights.num_bits,
+            desc_act=self.desc_act,
+            groupsize=self.groupsize,
+            quant_method="compressed-tensors",
+            sym=self.weights.symmetric,
+            sharded_infeatures=False,
+        )
+
+    def get_multi_weights_col(self, weights: Weights, prefixes: List[str], dim: int):
+        """
+        Load the column-sharded weights of all `prefixes` and concatenate
+        them into a single quantized weight.
+
+        The `dim` argument is not used; the transposed tensors are always
+        concatenated along dimension 1.
+
+        Raises:
+            RuntimeError: if a packed weight cannot be loaded.
+        """
+        try:
+            weight_packed = torch.cat(
+                [
+                    weights.get_sharded(f"{p}.weight_packed", dim=0).t()
+                    for p in prefixes
+                ],
+                dim=1,
+            )
+        except RuntimeError as e:
+            raise RuntimeError(
+                f"Cannot load w{self.weights.num_bits}a16 weight, make sure the model is already quantized"
+            ) from e
+
+        scales = torch.cat(
+            [weights.get_sharded(f"{p}.weight_scale", dim=0).t() for p in prefixes],
+            dim=1,
+        )
+
+        zero_point = None
+        if not self.weights.symmetric:
+            # NOTE(review): `get_weights` and `get_weights_row` read
+            # `weight_zero_point`; confirm that `qzeros` is the intended
+            # tensor name here. The result is also transposed once more
+            # after concatenation, unlike the scales - confirm.
+            zero_point = torch.cat(
+                [weights.get_sharded(f"{p}.qzeros", dim=0).t() for p in prefixes], dim=1
+            ).t()
+
+        g_idx = None
+        if self.desc_act:
+            # All prefixes must share the same group indices; only the
+            # first set is passed on.
+            # NOTE(review): `get_weights` reads `weight_g_idx`; confirm
+            # that `g_idx` is the intended tensor name here.
+            w = [weights.get_tensor(f"{p}.g_idx") for p in prefixes]
+            for w2 in w[1:]:
+                torch.testing.assert_close(w2, w[0])
+            g_idx = w[0]
+
+        return repack_gptq_for_marlin(
+            qweight=weight_packed.contiguous(),
+            scales=scales,
+            qzeros=zero_point,
+            g_idx=g_idx,
+            bits=self.weights.num_bits,
+            desc_act=self.desc_act,
+            groupsize=self.groupsize,
+            quant_method="compressed-tensors",
+            sym=self.weights.symmetric,
+            sharded_infeatures=False,
+        )
+
+    def get_weights_row(self, weights: Weights, prefix: str):
+        """
+        Load the row-sharded quantized weight stored under `prefix`.
+
+        Raises:
+            RuntimeError: if the packed weight cannot be loaded.
+        """
+        log_once(logger.info, "Using GPTQ-Marlin kernels")
+        try:
+            weight_packed = weights.get_sharded(f"{prefix}.weight_packed", dim=1).t()
+        except RuntimeError as e:
+            # This class has no `quantize` attribute, so the message is
+            # built from the bit width like in the other loading methods.
+            raise RuntimeError(
+                f"Cannot load w{self.weights.num_bits}a16 weight, make sure the model is already quantized"
+            ) from e
+
+        # With activation reordering or without groups, zero points and
+        # scales are loaded whole; otherwise they are sharded like the
+        # weight.
+        zero_point = None
+        if not self.weights.symmetric:
+            if self.desc_act or self.groupsize == -1:
+                zero_point = weights.get_tensor(f"{prefix}.weight_zero_point").t()
+            else:
+                zero_point = weights.get_sharded(
+                    f"{prefix}.weight_zero_point", dim=1
+                ).t()
+
+        g_idx = None
+        if self.desc_act:
+            # NOTE(review): `get_weights` reads `weight_g_idx`; confirm
+            # that `g_idx` is the intended tensor name here.
+            g_idx = weights.get_sharded(f"{prefix}.g_idx", dim=0)
+
+        if self.desc_act or self.groupsize == -1:
+            scales = weights.get_tensor(f"{prefix}.weight_scale").t()
+        else:
+            scales = weights.get_sharded(f"{prefix}.weight_scale", dim=1).t()
+
+        # Input features are only sharded when more than one process
+        # takes part in the process group.
+        sharded_in_features = weights.process_group.size() > 1
+
+        return repack_gptq_for_marlin(
+            qweight=weight_packed.contiguous(),
+            scales=scales,
+            qzeros=zero_point,
+            g_idx=g_idx,
+            bits=self.weights.num_bits,
+            desc_act=self.desc_act,
+            groupsize=self.groupsize,
+            quant_method="compressed-tensors",
+            sym=self.weights.symmetric,
+            sharded_infeatures=sharded_in_features,
+        )
--- a/server/text_generation_server/layers/fp8.py
+++ b/server/text_generation_server/layers/fp8.py
@ -29,7 +29,7 @@ else:
    CUTLASS_FP8_AVAILABLE = False


-def get_fp8_linear() -> Type[torch.nn.Module]:
+def get_fp8_linear(force_w8a16: bool = False) -> Type[torch.nn.Module]:
    """
    Return an FP8 linear `Module` that is compatible with the current system.
    """
@ -37,7 +37,14 @@ def get_fp8_linear() -> Type[torch.nn.Module]:
    if SYSTEM == "cuda":

        major, _ = torch.cuda.get_device_capability()
-        if major == 8 and os.getenv("USE_CUTLASS_W8A8", "0") != "1":
+        # Marlin is W8A16, use it when:
+        #
+        # - On capability 8.x where x < 8: W8A8 FP8 GEMM is not supported.
+        # - On capability 8.9: W8A8 FP8 GEMM is supported, but Marlin-FP8 is faster.
+        # - On capability 9.x when force_w8a16: cutlass kernels do not support W8A16.
+        if (major == 8 or (major == 9 and force_w8a16)) and os.getenv(
+            "USE_CUTLASS_W8A8", "0"
+        ) != "1":
            # NOTE: Capability 8.9 is supported by cutlass kernels, but FP8-Marlin
            #       gives better decoding throughput on L4 and L40.
            from text_generation_server.layers.marlin import GPTQMarlinFP8Linear
@ -283,14 +290,17 @@ class Fp8Weight(Weight):
    weight_scale: Optional[torch.Tensor] = None
    input_scale: Optional[torch.Tensor] = None
    activation_scale_ub: Optional[float] = None
+    force_w8a16: bool = False

    def get_linear(self, bias: torch.Tensor):
        if self.weight_scale is None:
-            return get_fp8_linear().from_unquant(self.weight, bias, self.dtype)
+            return get_fp8_linear(force_w8a16=self.force_w8a16).from_unquant(
+                self.weight, bias, self.dtype
+            )
        # This is not checked by the fbgemm kernels, but they require contiguous
        # memory. Can be non-contiguous when we e.g. expand from scalars.
        self.weight_scale = self.weight_scale.contiguous()
-        return get_fp8_linear().from_fp8(
+        return get_fp8_linear(force_w8a16=self.force_w8a16).from_fp8(
            weight=self.weight,
            scale=self.weight_scale,
            dtype=self.dtype,
--- a/server/text_generation_server/layers/marlin/gptq.py
+++ b/server/text_generation_server/layers/marlin/gptq.py
@ -261,7 +261,7 @@ class GPTQMarlinWeight(Weight):

    def __post_init__(self):
        assert self.qweight.dtype == torch.int32
-        assert self.scales.dtype == torch.float16
+        assert self.scales.dtype in (torch.float16, torch.bfloat16)
        assert self.g_idx.dtype == torch.int32
        assert self.perm.dtype == torch.int32

@ -300,7 +300,7 @@ def repack_gptq_for_marlin(
        raise RuntimeError(
            f"Repacking GPTQ weights with group size {groupsize} as Marlin is not supported, must be one of: {supported_sizes}"
        )
-    if not (sym or quant_method == "awq"):
+    if not (sym or quant_method == "awq" or quant_method == "compressed-tensors"):
        raise RuntimeError(
            "Repacking GPTQ weights with asymmetric quantization as Marlin is not supported."
        )
--- a/server/text_generation_server/models/init.py
+++ b/server/text_generation_server/models/init.py
@ -370,46 +370,23 @@ def get_model(
    compression_config = config_dict.get("compression_config", None)
    if quantization_config is not None and quantize is None:
        method = quantization_config.get("quant_method", None)
-        config_groups = quantization_config.get("config_groups", None)
        if method in {"gptq", "awq", "exl2"}:
            log_master(logger.info, f"Auto selecting quantization method {method}")
            quantize = method
        elif method == "fbgemm_fp8" or method == "fp8":
            log_master(logger.info, "Auto selecting quantization method fp8")
            quantize = "fp8"
-        elif config_groups is not None:
-            # TODO: at some point we should probably fully parse the compression
-            # configuration to know which parameters are compressed.
-            for _, group in config_groups.items():
-                weights_config = group.get("weights")
-                if weights_config is not None:
-                    if (
-                        weights_config["type"] == "float"
-                        and weights_config["num_bits"] == 8
-                    ):
+        # Must be `elif`: a separate `if` would pair the `else` below with this
+        # test only, warning "Unknown quantization method" for gptq/awq/exl2/fp8.
+        elif method == "compressed-tensors":
            log_master(
-                            logger.info, "Auto selecting quantization method fp8"
+                logger.info, "Auto selecting quantization method compressed-tensors"
            )
-                        quantize = "fp8"
-                        break
+            quantize = "compressed-tensors"
        else:
            log_master(logger.warning, f"Unknown quantization method {method}")
    elif compression_config is not None:
        # `compression_config` renamed to `quantization_config`; support retained for backward compatibility.
-        config_groups = compression_config.get("config_groups")
-        if config_groups is not None:
-            for _, group in config_groups.items():
-                weights_config = group.get("weights")
-                if weights_config is not None:
-                    if (
-                        weights_config["type"] == "float"
-                        and weights_config["num_bits"] == 8
-                    ):
-                        log_master(
-                            logger.info, "Auto selecting quantization method fp8"
-                        )
-                        quantize = "fp8"
-                        break
+        log_master(logger.info, "Auto selecting quantization method compressed-tensors")
+        quantize = "compressed-tensors"

    if dtype is None:
        if quantize in ["awq", "exl2", "gptq", "marlin"]:
@ -559,7 +536,7 @@ def get_model(
        # TODO: fix how we determine model type for Mamba
        if "ssm_cfg" in config_dict:
            # *only happens in Mamba case
-            model_type = "ssm"
+            model_type = "mamba"
        else:
            raise RuntimeError(
                f"Could not determine model type for {model_id} revision {revision}"
--- a/server/text_generation_server/models/custom_modeling/flash_dbrx_modeling.py
+++ b/server/text_generation_server/models/custom_modeling/flash_dbrx_modeling.py
@ -23,8 +23,10 @@ from typing import Optional, List, Tuple, Any
 from text_generation_server.layers.attention.kv_cache import get_kv_scales
 from text_generation_server.utils.import_utils import SYSTEM

-if SYSTEM != "ipex":
+if SYSTEM == "rocm":
    from vllm.model_executor.layers.fused_moe import fused_moe
+elif SYSTEM != "ipex":
+    from moe_kernels.fused_moe import fused_moe
 else:
    from intel_extension_for_pytorch.llm.modules import GatedMLPMOE

--- a/server/text_generation_server/models/custom_modeling/mamba_modeling.py
+++ b/server/text_generation_server/models/custom_modeling/mamba_modeling.py
@ -212,7 +212,7 @@ class MambaModel(nn.Module):
        try:
            self.lm_head = SpeculativeHead.load(config, f"{prefix}.embeddings", weights)
        except RuntimeError:
-            self.lm_head = SpeculativeHead.load(config, f"{prefix}.embeddings", weights)
+            self.lm_head = SpeculativeHead.load(config, f"{prefix}.embedding", weights)
        self.config = config

    def forward(
--- a/server/text_generation_server/utils/logits_process.py
+++ b/server/text_generation_server/utils/logits_process.py
@ -5,7 +5,7 @@ from loguru import logger
 from typing import Dict, Union
 from text_generation_server.pb.generate_pb2 import GrammarType

-from outlines.fsm.fsm import RegexFSM
+from outlines.fsm.guide import RegexGuide
 from outlines.fsm.json_schema import build_regex_from_schema
 from functools import lru_cache
 from typing import List, Optional, DefaultDict
@ -482,7 +482,7 @@ class HeterogeneousProcessorWrapper(LogitsProcessor):

 class GrammarLogitProcessor(LogitsProcessor):
    fsm_state: DefaultDict[int, int]
-    fsm: RegexFSM
+    fsm: RegexGuide

    def __init__(self, tokenizer, device, grammar, grammar_type):
        self.device = device
@ -498,8 +498,9 @@ class GrammarLogitProcessor(LogitsProcessor):
    ):
        if fsm_grammar_state == -1 or self.fsm is None:
            return logits
-        allowed_tokens = self.fsm.allowed_token_ids(fsm_grammar_state)
+        allowed_tokens = self.fsm.get_next_instruction(fsm_grammar_state).tokens
        mask = torch.full_like(logits, -math.inf)
+        if allowed_tokens is not None:
            mask[:, allowed_tokens] = 0
        biased_scores = logits + mask
        return biased_scores
@ -513,7 +514,7 @@ class GrammarLogitProcessor(LogitsProcessor):
    def _advance(next_token_id, fsm_grammar_state, fsm):
        if fsm_grammar_state == -1:
            return fsm_grammar_state
-        return fsm.next_state(fsm_grammar_state, next_token_id)
+        return fsm.get_next_state(fsm_grammar_state, next_token_id)

    # TODO: move grammar compilation into the router
    @staticmethod
@ -530,7 +531,7 @@ class GrammarLogitProcessor(LogitsProcessor):
                schema = "(.*?)"
        elif grammar_type == GrammarType.GRAMMAR_TYPE_REGEX:
            pass  # schema is already a regex just here for clarity
-        fsm = RegexFSM(schema, tokenizer)
+        fsm = RegexGuide.from_regex(schema, tokenizer)
        logger.debug(f"Compiled FSM in {time.time() - start_time:.2f}s")
        return fsm

@ -588,7 +589,8 @@ class HeterogeneousGrammarLogitProcessor(LogitsProcessor):
            fsm = self.fsms[i]
            if fsm_grammar_states[i] == -1 or fsm is None:
                continue
-            allowed_tokens = fsm.allowed_token_ids(fsm_grammar_states[i])
+            allowed_tokens = fsm.get_next_instruction(fsm_grammar_states[i]).tokens
+            if allowed_tokens is not None:
                mask[i, allowed_tokens] = 0
            logits[i] += mask[i]
        return logits
--- a/server/text_generation_server/utils/quantization.py
+++ b/server/text_generation_server/utils/quantization.py
@ -27,7 +27,20 @@ class _FP8QuantizerConfig:
    activation_scale_ub: float


-# We should probably do this with Pytantic JSON deserialization,
+def _get_config_json(model_id: str, revision: Optional[str], filename: str):
+    """Load and parse a JSON file that belongs to a model.
+
+    If `filename` exists inside the local directory `model_id`, that copy is
+    read; otherwise the file is fetched with `hf_hub_download` for the given
+    `revision`.
+
+    Returns the parsed JSON content. Errors raised by the download, by opening
+    the file, or by JSON decoding propagate to the caller.
+    """
+    # Test for the file itself rather than only the directory, matching the
+    # per-call-site checks this helper replaces.
+    local_path = os.path.join(model_id, filename)
+    if os.path.exists(local_path):
+        filename = local_path
+    else:
+        filename = hf_hub_download(model_id, filename=filename, revision=revision)
+    with open(filename, "r") as f:
+        return json.load(f)
+
+
+# We should probably do this with Pydantic JSON deserialization,
 # but for now we'll stay close to the old _set_gptq_params.
 def _get_quantizer_config(model_id, revision):
    bits = 4
@ -39,12 +52,7 @@ def _get_quantizer_config(model_id, revision):

    filename = "config.json"
    try:
-        if os.path.exists(os.path.join(model_id, filename)):
-            filename = os.path.join(model_id, filename)
-        else:
-            filename = hf_hub_download(model_id, filename=filename, revision=revision)
-        with open(filename, "r") as f:
-            data = json.load(f)
+        data = _get_config_json(model_id, revision, filename)

        # FP8 config
        if data["quantization_config"]["quant_method"] == "fbgemm_fp8":
@ -67,14 +75,7 @@ def _get_quantizer_config(model_id, revision):
    except Exception:
        filename = "quantize_config.json"
        try:
-            if os.path.exists(os.path.join(model_id, filename)):
-                filename = os.path.join(model_id, filename)
-            else:
-                filename = hf_hub_download(
-                    model_id, filename=filename, revision=revision
-                )
-            with open(filename, "r") as f:
-                data = json.load(f)
+            data = _get_config_json(model_id, revision, filename)
            bits = data["bits"]
            groupsize = data["group_size"]

@ -90,14 +91,7 @@ def _get_quantizer_config(model_id, revision):
        except Exception:
            filename = "quant_config.json"
            try:
-                if os.path.exists(os.path.join(model_id, filename)):
-                    filename = os.path.join(model_id, filename)
-                else:
-                    filename = hf_hub_download(
-                        model_id, filename=filename, revision=revision
-                    )
-                with open(filename, "r") as f:
-                    data = json.load(f)
+                data = _get_config_json(model_id, revision, filename)
                bits = data["w_bit"]
                groupsize = data["q_group_size"]
                desc_act = data["desc_act"]
@ -119,6 +113,14 @@ def _get_quantizer_config(model_id, revision):
 def get_loader(
    quantize: Optional[str], model_id: str, revision: Optional[str]
 ) -> WeightsLoader:
+    if quantize == "compressed-tensors":
+        config = _get_config_json(model_id, revision, "config.json")
+        from text_generation_server.layers.compressed_tensors import (
+            CompressedTensorsLoader,
+        )
+
+        return CompressedTensorsLoader(config)
+
    quantizer_config = _get_quantizer_config(model_id, revision)
    if quantize in {"awq", "gptq"}:
        from text_generation_server.layers.gptq import GPTQWeightsLoader