Merge branch 'main' into moe

Commit e0e39fa0d9

Dockerfile (16 lines changed)
@@ -161,18 +161,6 @@ COPY server/custom_kernels/ .
 # Build specific version of transformers
 RUN python setup.py build
 
-# Build vllm CUDA kernels
-FROM kernel-builder AS vllm-builder
-
-WORKDIR /usr/src
-
-ENV TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 8.9 9.0+PTX"
-
-COPY server/Makefile-vllm Makefile
-
-# Build specific version of vllm
-RUN make build-vllm-cuda
-
 # Build mamba kernels
 FROM kernel-builder AS mamba-builder
 WORKDIR /usr/src

@@ -230,8 +218,6 @@ COPY --from=awq-kernels-builder /usr/src/llm-awq/awq/kernels/build/lib.linux-x86
 COPY --from=eetq-kernels-builder /usr/src/eetq/build/lib.linux-x86_64-cpython-311 /opt/conda/lib/python3.11/site-packages
 # Copy build artifacts from lorax punica kernels builder
 COPY --from=lorax-punica-builder /usr/src/lorax-punica/server/punica_kernels/build/lib.linux-x86_64-cpython-311 /opt/conda/lib/python3.11/site-packages
-# Copy build artifacts from vllm builder
-COPY --from=vllm-builder /usr/src/vllm/build/lib.linux-x86_64-cpython-311 /opt/conda/lib/python3.11/site-packages
 # Copy build artifacts from mamba builder
 COPY --from=mamba-builder /usr/src/mamba/build/lib.linux-x86_64-cpython-311/ /opt/conda/lib/python3.11/site-packages
 COPY --from=mamba-builder /usr/src/causal-conv1d/build/lib.linux-x86_64-cpython-311/ /opt/conda/lib/python3.11/site-packages

@@ -247,7 +233,7 @@ COPY server/Makefile server/Makefile
 RUN cd server && \
     make gen-server && \
     pip install -r requirements_cuda.txt && \
-    pip install ".[bnb, accelerate, marlin, moe, quantize, peft, outlines]" --no-cache-dir && \
+    pip install ".[attention, bnb, accelerate, compressed-tensors, marlin, moe, quantize, peft, outlines]" --no-cache-dir && \
     pip install nvidia-nccl-cu12==2.22.3
 
 ENV LD_PRELOAD=/opt/conda/lib/python3.11/site-packages/nvidia/nccl/lib/libnccl.so.2
@@ -296,7 +296,7 @@ COPY server/Makefile server/Makefile
 RUN cd server && \
     make gen-server && \
     pip install -r requirements_rocm.txt && \
-    pip install ".[accelerate, peft, outlines]" --no-cache-dir
+    pip install ".[accelerate, compressed-tensors, peft, outlines]" --no-cache-dir
 
 # Install benchmarker
 COPY --from=builder /usr/src/target/release-opt/text-generation-benchmark /usr/local/bin/text-generation-benchmark
@@ -117,7 +117,7 @@ COPY server/Makefile server/Makefile
 RUN cd server && \
     make gen-server && \
     pip install -r requirements_intel.txt && \
-    pip install ".[accelerate, peft, outlines]" --no-cache-dir
+    pip install ".[accelerate, compressed-tensors, peft, outlines]" --no-cache-dir
 
 ENV CCL_ROOT=/opt/intel/oneapi/ccl/latest
 ENV I_MPI_ROOT=/opt/intel/oneapi/mpi/latest
@@ -180,7 +180,7 @@ async fn prefill(
     let latency = start_time.elapsed();
 
     // Compute throughput from latency and batch size
-    let throughput = batch_size as f64 / latency.as_secs_f64();
+    let throughput = (batch_size * sequence_length) as f64 / latency.as_secs_f64();
 
     // Decode batch cannot be empty
     let decode_batch = decode_batch.expect("decode_batch is None. This is a bug.");
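With this change the benchmarker reports prefill throughput in prompt tokens per second rather than requests per second: each of the `batch_size` requests contributes `sequence_length` input tokens. A minimal sketch of the new metric, written in Python purely for illustration (the names mirror the Rust variables above):

```python
# Illustrative sketch only: prefill throughput after this change counts
# prompt tokens per second, not requests per second.
def prefill_throughput(batch_size: int, sequence_length: int, latency_s: float) -> float:
    # Every request in the batch contributes `sequence_length` prefill tokens.
    return (batch_size * sequence_length) / latency_s

# Example: 32 requests with 512-token prompts prefilled in 0.8 s -> 20480.0 tokens/s
print(prefill_throughput(32, 512, 0.8))
```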
@@ -36,7 +36,10 @@
       "content": {
         "application/json": {
           "schema": {
-            "$ref": "#/components/schemas/GenerateResponse"
+            "type": "array",
+            "items": {
+              "$ref": "#/components/schemas/GenerateResponse"
+            }
           }
         },
         "text/event-stream": {
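After this schema change the documented JSON body is an array of `GenerateResponse` objects rather than a single object. A hedged sketch of reading such a body on the client side, using only field names that appear in the snapshot files added later in this commit (the literal body below is hypothetical):

```python
import json

# Hypothetical, abbreviated response body; real responses carry the full `details` object.
body = '[{"generated_text": " Deep learning is a type of artificial intelligence (AI", "details": {"finish_reason": "length", "generated_tokens": 10}}]'

for response in json.loads(body):  # the body is now a JSON array
    print(response["generated_text"], response["details"]["finish_reason"])
```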
@@ -62,15 +62,16 @@ Options:
           [env: QUANTIZE=]
 
           Possible values:
           - awq: 4 bit quantization. Requires a specific AWQ quantized model: <https://hf.co/models?search=awq>. Should replace GPTQ models wherever possible because of the better latency
+          - compressed-tensors: Compressed tensors, which can be a mixture of different quantization methods
           - eetq: 8 bit quantization, doesn't require specific model. Should be a drop-in replacement to bitsandbytes with much better performance. Kernels are from <https://github.com/NetEase-FuXi/EETQ.git>
           - exl2: Variable bit quantization. Requires a specific EXL2 quantized model: <https://hf.co/models?search=exl2>. Requires exllama2 kernels and does not support tensor parallelism (num_shard > 1)
           - gptq: 4 bit quantization. Requires a specific GTPQ quantized model: <https://hf.co/models?search=gptq>. text-generation-inference will use exllama (faster) kernels wherever possible, and use triton kernel (wider support) when it's not. AWQ has faster kernels
           - marlin: 4 bit quantization. Requires a specific Marlin quantized model: <https://hf.co/models?search=marlin>
           - bitsandbytes: Bitsandbytes 8bit. Can be applied on any model, will cut the memory requirement in half, but it is known that the model will be much slower to run than the native f16
           - bitsandbytes-nf4: Bitsandbytes 4bit. Can be applied on any model, will cut the memory requirement by 4x, but it is known that the model will be much slower to run than the native f16
           - bitsandbytes-fp4: Bitsandbytes 4bit. nf4 should be preferred in most cases but maybe this one has better perplexity performance for you model
           - fp8: [FP8](https://developer.nvidia.com/blog/nvidia-arm-and-intel-publish-fp8-specification-for-standardization-as-an-interchange-format-for-ai/) (e4m3) works on H100 and above This dtype has native ops should be the fastest if available. This is currently not the fastest because of local unpacking + padding to satisfy matrix multiplication limitations
 
 ```
 
 ## SPECULATE
flake.lock (12 lines changed)

@@ -718,11 +718,11 @@
     },
     "nixpkgs_6": {
       "locked": {
-        "lastModified": 1727675176,
-        "narHash": "sha256-xIjBFMYldWvj+g8ahxMPofsj+OqxvKJN6YylNHQ7gn4=",
+        "lastModified": 1731562571,
+        "narHash": "sha256-9V0C/H6NL2Vk3Y76msqNA8TgwZ6Ge4frOVawTNFJQmM=",
         "owner": "nixos",
         "repo": "nixpkgs",
-        "rev": "a6d0207fea9212d28cd3d487efe6bc699663b93a",
+        "rev": "19d66fab291f90ce56d0479b128cc7a5271bf666",
         "type": "github"
       },
       "original": {
@@ -978,11 +978,11 @@
       "nixpkgs": "nixpkgs_6"
     },
     "locked": {
-      "lastModified": 1730724647,
-      "narHash": "sha256-SVv+50CGaCoU4zZwsg6ZAaOi/D5QJBL1P2SIB+3CEf4=",
+      "lastModified": 1731674227,
+      "narHash": "sha256-k/ur37KSc+RXcwwz0tgxeamz6wQ5rsOe5hMepzIdD2s=",
       "owner": "huggingface",
       "repo": "text-generation-inference-nix",
-      "rev": "1512898a1e5ad9eff025205fa9c4d33a44506cf3",
+      "rev": "407b9e22a0b7121bf6e171d67ce0144e3f3e39bf",
       "type": "github"
     },
     "original": {
@@ -0,0 +1,104 @@
{
  "details": {
    "best_of_sequences": null,
    "finish_reason": "length",
    "generated_tokens": 10,
    "prefill": [
      { "id": 128000, "logprob": null, "text": "<|begin_of_text|>" },
      { "id": 3923, "logprob": -7.609375, "text": "What" },
      { "id": 374, "logprob": -0.92529297, "text": " is" },
      { "id": 5655, "logprob": -10.0, "text": " deep" },
      { "id": 6975, "logprob": -0.94628906, "text": " learning" },
      { "id": 30, "logprob": -2.9042969, "text": "?" }
    ],
    "seed": null,
    "tokens": [
      { "id": 18682, "logprob": -0.8769531, "special": false, "text": " Deep" },
      { "id": 6975, "logprob": -0.0076942444, "special": false, "text": " learning" },
      { "id": 374, "logprob": -0.25073242, "special": false, "text": " is" },
      { "id": 264, "logprob": -0.097595215, "special": false, "text": " a" },
      { "id": 955, "logprob": -0.921875, "special": false, "text": " type" },
      { "id": 315, "logprob": -0.00027918816, "special": false, "text": " of" },
      { "id": 21075, "logprob": -0.5527344, "special": false, "text": " artificial" },
      { "id": 11478, "logprob": -0.042541504, "special": false, "text": " intelligence" },
      { "id": 320, "logprob": -0.38891602, "special": false, "text": " (" },
      { "id": 15836, "logprob": -0.0011043549, "special": false, "text": "AI" }
    ],
    "top_tokens": null
  },
  "generated_text": " Deep learning is a type of artificial intelligence (AI"
}
@@ -0,0 +1,99 @@
{
  "details": {
    "best_of_sequences": null,
    "finish_reason": "length",
    "generated_tokens": 10,
    "prefill": [
      { "id": 128000, "logprob": null, "text": "<|begin_of_text|>" },
      { "id": 3923, "logprob": -7.609375, "text": "What" },
      { "id": 374, "logprob": -0.92529297, "text": " is" },
      { "id": 5655, "logprob": -10.0, "text": " deep" },
      { "id": 6975, "logprob": -0.94628906, "text": " learning" }
    ],
    "seed": 0,
    "tokens": [
      { "id": 5380, "logprob": -0.23840332, "special": false, "text": "?\n" },
      { "id": 34564, "logprob": 0.0, "special": false, "text": "Deep" },
      { "id": 6975, "logprob": 0.0, "special": false, "text": " learning" },
      { "id": 11, "logprob": 0.0, "special": false, "text": "," },
      { "id": 1101, "logprob": -1.2011719, "special": false, "text": " also" },
      { "id": 3967, "logprob": 0.0, "special": false, "text": " known" },
      { "id": 439, "logprob": 0.0, "special": false, "text": " as" },
      { "id": 30828, "logprob": 0.0, "special": false, "text": " neural" },
      { "id": 4009, "logprob": -0.6777344, "special": false, "text": " network" },
      { "id": 477, "logprob": 0.0, "special": false, "text": " or" }
    ],
    "top_tokens": null
  },
  "generated_text": "What is deep learning?\nDeep learning, also known as neural network or"
}
@@ -0,0 +1,418 @@
[
  {
    "details": {
      "best_of_sequences": null,
      "finish_reason": "length",
      "generated_tokens": 10,
      "prefill": [
        { "id": 128000, "logprob": null, "text": "<|begin_of_text|>" },
        { "id": 3923, "logprob": -7.609375, "text": "What" },
        { "id": 374, "logprob": -0.92529297, "text": " is" },
        { "id": 5655, "logprob": -10.0, "text": " deep" },
        { "id": 6975, "logprob": -0.94628906, "text": " learning" },
        { "id": 30, "logprob": -2.9042969, "text": "?" }
      ],
      "seed": null,
      "tokens": [
        { "id": 18682, "logprob": -0.8769531, "special": false, "text": " Deep" },
        { "id": 6975, "logprob": -0.0076942444, "special": false, "text": " learning" },
        { "id": 374, "logprob": -0.25146484, "special": false, "text": " is" },
        { "id": 264, "logprob": -0.097595215, "special": false, "text": " a" },
        { "id": 955, "logprob": -0.9248047, "special": false, "text": " type" },
        { "id": 315, "logprob": -0.00027513504, "special": false, "text": " of" },
        { "id": 21075, "logprob": -0.5527344, "special": false, "text": " artificial" },
        { "id": 11478, "logprob": -0.043151855, "special": false, "text": " intelligence" },
        { "id": 320, "logprob": -0.3840332, "special": false, "text": " (" },
        { "id": 15836, "logprob": -0.0011043549, "special": false, "text": "AI" }
      ],
      "top_tokens": null
    },
    "generated_text": " Deep learning is a type of artificial intelligence (AI"
  },
  {
    "details": {
      "best_of_sequences": null,
      "finish_reason": "length",
      "generated_tokens": 10,
      "prefill": [
        { "id": 128000, "logprob": null, "text": "<|begin_of_text|>" },
        { "id": 3923, "logprob": -7.6054688, "text": "What" },
        { "id": 374, "logprob": -0.92089844, "text": " is" },
        { "id": 5655, "logprob": -10.0, "text": " deep" },
        { "id": 6975, "logprob": -0.94433594, "text": " learning" },
        { "id": 30, "logprob": -2.90625, "text": "?" }
      ],
      "seed": null,
      "tokens": [
        { "id": 18682, "logprob": -0.875, "special": false, "text": " Deep" },
        { "id": 6975, "logprob": -0.007698059, "special": false, "text": " learning" },
        { "id": 374, "logprob": -0.25268555, "special": false, "text": " is" },
        { "id": 264, "logprob": -0.09753418, "special": false, "text": " a" },
        { "id": 955, "logprob": -0.92529297, "special": false, "text": " type" },
        { "id": 315, "logprob": -0.00027942657, "special": false, "text": " of" },
        { "id": 21075, "logprob": -0.5527344, "special": false, "text": " artificial" },
        { "id": 11478, "logprob": -0.042541504, "special": false, "text": " intelligence" },
        { "id": 320, "logprob": -0.3840332, "special": false, "text": " (" },
        { "id": 15836, "logprob": -0.0011053085, "special": false, "text": "AI" }
      ],
      "top_tokens": null
    },
    "generated_text": " Deep learning is a type of artificial intelligence (AI"
  },
  {
    "details": {
      "best_of_sequences": null,
      "finish_reason": "length",
      "generated_tokens": 10,
      "prefill": [
        { "id": 128000, "logprob": null, "text": "<|begin_of_text|>" },
        { "id": 3923, "logprob": -7.6054688, "text": "What" },
        { "id": 374, "logprob": -0.92089844, "text": " is" },
        { "id": 5655, "logprob": -10.0, "text": " deep" },
        { "id": 6975, "logprob": -0.94433594, "text": " learning" },
        { "id": 30, "logprob": -2.90625, "text": "?" }
      ],
      "seed": null,
      "tokens": [
        { "id": 18682, "logprob": -0.875, "special": false, "text": " Deep" },
        { "id": 6975, "logprob": -0.007698059, "special": false, "text": " learning" },
        { "id": 374, "logprob": -0.25268555, "special": false, "text": " is" },
        { "id": 264, "logprob": -0.09753418, "special": false, "text": " a" },
        { "id": 955, "logprob": -0.92529297, "special": false, "text": " type" },
        { "id": 315, "logprob": -0.00027942657, "special": false, "text": " of" },
        { "id": 21075, "logprob": -0.5527344, "special": false, "text": " artificial" },
        { "id": 11478, "logprob": -0.042541504, "special": false, "text": " intelligence" },
        { "id": 320, "logprob": -0.3840332, "special": false, "text": " (" },
        { "id": 15836, "logprob": -0.0011053085, "special": false, "text": "AI" }
      ],
      "top_tokens": null
    },
    "generated_text": " Deep learning is a type of artificial intelligence (AI"
  },
  {
    "details": {
      "best_of_sequences": null,
      "finish_reason": "length",
      "generated_tokens": 10,
      "prefill": [
        { "id": 128000, "logprob": null, "text": "<|begin_of_text|>" },
        { "id": 3923, "logprob": -7.6054688, "text": "What" },
        { "id": 374, "logprob": -0.92089844, "text": " is" },
        { "id": 5655, "logprob": -10.0, "text": " deep" },
        { "id": 6975, "logprob": -0.94433594, "text": " learning" },
        { "id": 30, "logprob": -2.90625, "text": "?" }
      ],
      "seed": null,
      "tokens": [
        { "id": 18682, "logprob": -0.875, "special": false, "text": " Deep" },
        { "id": 6975, "logprob": -0.007698059, "special": false, "text": " learning" },
        { "id": 374, "logprob": -0.25268555, "special": false, "text": " is" },
        { "id": 264, "logprob": -0.09753418, "special": false, "text": " a" },
        { "id": 955, "logprob": -0.92529297, "special": false, "text": " type" },
        { "id": 315, "logprob": -0.00027942657, "special": false, "text": " of" },
        { "id": 21075, "logprob": -0.5527344, "special": false, "text": " artificial" },
        { "id": 11478, "logprob": -0.042541504, "special": false, "text": " intelligence" },
        { "id": 320, "logprob": -0.3840332, "special": false, "text": " (" },
        { "id": 15836, "logprob": -0.0011053085, "special": false, "text": "AI" }
      ],
      "top_tokens": null
    },
    "generated_text": " Deep learning is a type of artificial intelligence (AI"
  }
]
@@ -0,0 +1,104 @@
{
  "details": {
    "best_of_sequences": null,
    "finish_reason": "length",
    "generated_tokens": 10,
    "prefill": [
      { "id": 2, "logprob": null, "text": "<bos>" },
      { "id": 1841, "logprob": -5.46875, "text": "What" },
      { "id": 603, "logprob": -0.69140625, "text": " is" },
      { "id": 5271, "logprob": -12.0, "text": " deep" },
      { "id": 6044, "logprob": -0.32226562, "text": " learning" },
      { "id": 235336, "logprob": -0.33203125, "text": "?" }
    ],
    "seed": null,
    "tokens": [
      { "id": 109, "logprob": -0.24707031, "special": false, "text": "\n\n" },
      { "id": 26843, "logprob": -0.14550781, "special": false, "text": "Deep" },
      { "id": 6044, "logprob": -0.038330078, "special": false, "text": " learning" },
      { "id": 603, "logprob": -0.029907227, "special": false, "text": " is" },
      { "id": 476, "logprob": -0.020996094, "special": false, "text": " a" },
      { "id": 38397, "logprob": -0.828125, "special": false, "text": " subset" },
      { "id": 576, "logprob": -0.00049209595, "special": false, "text": " of" },
      { "id": 6479, "logprob": -0.057373047, "special": false, "text": " machine" },
      { "id": 6044, "logprob": -0.000207901, "special": false, "text": " learning" },
      { "id": 674, "logprob": -0.15429688, "special": false, "text": " that" }
    ],
    "top_tokens": null
  },
  "generated_text": "\n\nDeep learning is a subset of machine learning that"
}
@@ -0,0 +1,99 @@
{
  "details": {
    "best_of_sequences": null,
    "finish_reason": "length",
    "generated_tokens": 10,
    "prefill": [
      { "id": 2, "logprob": null, "text": "<bos>" },
      { "id": 1841, "logprob": -5.46875, "text": "What" },
      { "id": 603, "logprob": -0.69140625, "text": " is" },
      { "id": 5271, "logprob": -12.0, "text": " deep" },
      { "id": 6044, "logprob": -0.32226562, "text": " learning" }
    ],
    "seed": 0,
    "tokens": [
      { "id": 235336, "logprob": 0.0, "special": false, "text": "?" },
      { "id": 109, "logprob": 0.0, "special": false, "text": "\n\n" },
      { "id": 26843, "logprob": 0.0, "special": false, "text": "Deep" },
      { "id": 14715, "logprob": -0.38671875, "special": false, "text": " Learning" },
      { "id": 603, "logprob": 0.0, "special": false, "text": " is" },
      { "id": 476, "logprob": 0.0, "special": false, "text": " a" },
      { "id": 38397, "logprob": -0.12695312, "special": false, "text": " subset" },
      { "id": 576, "logprob": 0.0, "special": false, "text": " of" },
      { "id": 6479, "logprob": 0.0, "special": false, "text": " machine" },
      { "id": 6044, "logprob": 0.0, "special": false, "text": " learning" }
    ],
    "top_tokens": null
  },
  "generated_text": "What is deep learning?\n\nDeep Learning is a subset of machine learning"
}
@@ -0,0 +1,418 @@
[
  {
    "details": {
      "best_of_sequences": null,
      "finish_reason": "length",
      "generated_tokens": 10,
      "prefill": [
        { "id": 2, "logprob": null, "text": "<bos>" },
        { "id": 1841, "logprob": -5.46875, "text": "What" },
        { "id": 603, "logprob": -0.69140625, "text": " is" },
        { "id": 5271, "logprob": -12.0, "text": " deep" },
        { "id": 6044, "logprob": -0.32226562, "text": " learning" },
        { "id": 235336, "logprob": -0.33203125, "text": "?" }
      ],
      "seed": null,
      "tokens": [
        { "id": 109, "logprob": -0.24707031, "special": false, "text": "\n\n" },
        { "id": 26843, "logprob": -0.14550781, "special": false, "text": "Deep" },
        { "id": 6044, "logprob": -0.03857422, "special": false, "text": " learning" },
        { "id": 603, "logprob": -0.030883789, "special": false, "text": " is" },
        { "id": 476, "logprob": -0.020996094, "special": false, "text": " a" },
        { "id": 38397, "logprob": -0.828125, "special": false, "text": " subset" },
        { "id": 576, "logprob": -0.00051498413, "special": false, "text": " of" },
        { "id": 6479, "logprob": -0.05883789, "special": false, "text": " machine" },
        { "id": 6044, "logprob": -0.00020694733, "special": false, "text": " learning" },
        { "id": 674, "logprob": -0.15820312, "special": false, "text": " that" }
      ],
      "top_tokens": null
    },
    "generated_text": "\n\nDeep learning is a subset of machine learning that"
  },
  {
    "details": {
      "best_of_sequences": null,
      "finish_reason": "length",
      "generated_tokens": 10,
      "prefill": [
        { "id": 2, "logprob": null, "text": "<bos>" },
        { "id": 1841, "logprob": -5.46875, "text": "What" },
        { "id": 603, "logprob": -0.71484375, "text": " is" },
        { "id": 5271, "logprob": -12.0, "text": " deep" },
        { "id": 6044, "logprob": -0.30859375, "text": " learning" },
        { "id": 235336, "logprob": -0.3359375, "text": "?" }
      ],
      "seed": null,
      "tokens": [
        { "id": 109, "logprob": -0.23828125, "special": false, "text": "\n\n" },
        { "id": 26843, "logprob": -0.14550781, "special": false, "text": "Deep" },
        { "id": 6044, "logprob": -0.038330078, "special": false, "text": " learning" },
        { "id": 603, "logprob": -0.030883789, "special": false, "text": " is" },
        { "id": 476, "logprob": -0.020996094, "special": false, "text": " a" },
        { "id": 38397, "logprob": -0.80859375, "special": false, "text": " subset" },
        { "id": 576, "logprob": -0.0005455017, "special": false, "text": " of" },
        { "id": 6479, "logprob": -0.05908203, "special": false, "text": " machine" },
        { "id": 6044, "logprob": -0.00020599365, "special": false, "text": " learning" },
        { "id": 674, "logprob": -0.17285156, "special": false, "text": " that" }
      ],
      "top_tokens": null
    },
    "generated_text": "\n\nDeep learning is a subset of machine learning that"
  },
  {
    "details": {
      "best_of_sequences": null,
      "finish_reason": "length",
      "generated_tokens": 10,
      "prefill": [
        { "id": 2, "logprob": null, "text": "<bos>" },
        { "id": 1841, "logprob": -5.46875, "text": "What" },
        { "id": 603, "logprob": -0.71484375, "text": " is" },
        { "id": 5271, "logprob": -12.0, "text": " deep" },
        { "id": 6044, "logprob": -0.30859375, "text": " learning" },
        { "id": 235336, "logprob": -0.3359375, "text": "?" }
      ],
      "seed": null,
      "tokens": [
        { "id": 109, "logprob": -0.23828125, "special": false, "text": "\n\n" },
        { "id": 26843, "logprob": -0.14550781, "special": false, "text": "Deep" },
        { "id": 6044, "logprob": -0.038330078, "special": false, "text": " learning" },
        { "id": 603, "logprob": -0.030883789, "special": false, "text": " is" },
        { "id": 476, "logprob": -0.020996094, "special": false, "text": " a" },
        { "id": 38397, "logprob": -0.80859375, "special": false, "text": " subset" },
        { "id": 576, "logprob": -0.0005455017, "special": false, "text": " of" },
        { "id": 6479, "logprob": -0.05908203, "special": false, "text": " machine" },
        { "id": 6044, "logprob": -0.00020599365, "special": false, "text": " learning" },
        { "id": 674, "logprob": -0.17285156, "special": false, "text": " that" }
      ],
      "top_tokens": null
    },
    "generated_text": "\n\nDeep learning is a subset of machine learning that"
  },
  {
    "details": {
      "best_of_sequences": null,
      "finish_reason": "length",
      "generated_tokens": 10,
      "prefill": [
        { "id": 2, "logprob": null, "text": "<bos>" },
        { "id": 1841, "logprob": -5.46875, "text": "What" },
        { "id": 603, "logprob": -0.71484375, "text": " is" },
        { "id": 5271, "logprob": -12.0, "text": " deep" },
        { "id": 6044, "logprob": -0.30859375, "text": " learning" },
        { "id": 235336, "logprob": -0.3359375, "text": "?" }
      ],
      "seed": null,
      "tokens": [
        { "id": 109, "logprob": -0.23828125, "special": false, "text": "\n\n" },
        { "id": 26843, "logprob": -0.14550781, "special": false, "text": "Deep" },
        { "id": 6044, "logprob": -0.038330078, "special": false, "text": " learning" },
        { "id": 603, "logprob": -0.030883789, "special": false, "text": " is" },
        { "id": 476, "logprob": -0.020996094, "special": false, "text": " a" },
        { "id": 38397, "logprob": -0.80859375, "special": false, "text": " subset" },
        { "id": 576, "logprob": -0.0005455017, "special": false, "text": " of" },
        { "id": 6479, "logprob": -0.05908203, "special": false, "text": " machine" },
        { "id": 6044, "logprob": -0.00020599365, "special": false, "text": " learning" },
        { "id": 674, "logprob": -0.17285156, "special": false, "text": " that" }
      ],
      "top_tokens": null
    },
    "generated_text": "\n\nDeep learning is a subset of machine learning that"
  }
]
@@ -0,0 +1,86 @@
import pytest


@pytest.fixture(scope="module")
def compressed_tensors_w8an_handle(launcher):
    with launcher(
        "neuralmagic/Llama-3.2-1B-Instruct-FP8",
        num_shard=2,
        quantize="compressed-tensors",
    ) as handle:
        yield handle


@pytest.fixture(scope="module")
async def compressed_tensors_w8an(compressed_tensors_w8an_handle):
    await compressed_tensors_w8an_handle.health(300)
    return compressed_tensors_w8an_handle.client


@pytest.mark.release
@pytest.mark.asyncio
@pytest.mark.private
async def test_compressed_tensors_w8an(compressed_tensors_w8an, response_snapshot):
    response = await compressed_tensors_w8an.generate(
        "What is deep learning?",
        max_new_tokens=10,
        decoder_input_details=True,
    )

    assert (
        response.generated_text
        == " Deep learning is a type of artificial intelligence (AI"
    )
    assert response.details.generated_tokens == 10
    assert response == response_snapshot


@pytest.mark.asyncio
async def test_compressed_tensors_w8an_all_params(
    compressed_tensors_w8an, response_snapshot
):
    response = await compressed_tensors_w8an.generate(
        "What is deep learning",
        max_new_tokens=10,
        repetition_penalty=1.2,
        return_full_text=True,
        stop_sequences=["test"],
        temperature=0.5,
        top_p=0.9,
        top_k=10,
        truncate=5,
        typical_p=0.9,
        watermark=True,
        decoder_input_details=True,
        seed=0,
    )

    assert response.details.generated_tokens == 10
    assert (
        response.generated_text
        == "What is deep learning?\nDeep learning, also known as neural network or"
    )
    assert response == response_snapshot


@pytest.mark.release
@pytest.mark.asyncio
@pytest.mark.private
async def test_compressed_tensors_w8an_load(
    compressed_tensors_w8an, generate_load, response_snapshot
):
    responses = await generate_load(
        compressed_tensors_w8an,
        "What is deep learning?",
        max_new_tokens=10,
        n=4,
    )

    assert (
        responses[0].generated_text
        == " Deep learning is a type of artificial intelligence (AI"
    )
    assert len(responses) == 4
    assert all([r.generated_text == responses[0].generated_text for r in responses])

    assert responses == response_snapshot
@@ -0,0 +1,86 @@
import pytest


@pytest.fixture(scope="module")
def compressed_tensors_wna16_handle(launcher):
    with launcher(
        "neuralmagic/gemma-2-2b-it-quantized.w4a16",
        num_shard=2,
        quantize="compressed-tensors",
    ) as handle:
        yield handle


@pytest.fixture(scope="module")
async def compressed_tensors_wna16(compressed_tensors_wna16_handle):
    await compressed_tensors_wna16_handle.health(300)
    return compressed_tensors_wna16_handle.client


@pytest.mark.release
@pytest.mark.asyncio
@pytest.mark.private
async def test_compressed_tensors_wna16(compressed_tensors_wna16, response_snapshot):
    response = await compressed_tensors_wna16.generate(
        "What is deep learning?",
        max_new_tokens=10,
        decoder_input_details=True,
    )

    assert (
        response.generated_text
        == "\n\nDeep learning is a subset of machine learning that"
    )
    assert response.details.generated_tokens == 10
    assert response == response_snapshot


@pytest.mark.asyncio
async def test_compressed_tensors_wna16_all_params(
    compressed_tensors_wna16, response_snapshot
):
    response = await compressed_tensors_wna16.generate(
        "What is deep learning",
        max_new_tokens=10,
        repetition_penalty=1.2,
        return_full_text=True,
        stop_sequences=["test"],
        temperature=0.5,
        top_p=0.9,
        top_k=10,
        truncate=5,
        typical_p=0.9,
        watermark=True,
        decoder_input_details=True,
        seed=0,
    )

    assert response.details.generated_tokens == 10
    assert (
        response.generated_text
        == "What is deep learning?\n\nDeep Learning is a subset of machine learning"
    )
    assert response == response_snapshot


@pytest.mark.release
@pytest.mark.asyncio
@pytest.mark.private
async def test_compressed_tensors_wna16_load(
    compressed_tensors_wna16, generate_load, response_snapshot
):
    responses = await generate_load(
        compressed_tensors_wna16,
        "What is deep learning?",
        max_new_tokens=10,
        n=4,
    )

    assert (
        responses[0].generated_text
        == "\n\nDeep learning is a subset of machine learning that"
    )
    assert len(responses) == 4
    assert all([r.generated_text == responses[0].generated_text for r in responses])

    assert responses == response_snapshot
@@ -212,6 +212,8 @@ enum Quantization {
     /// <https://hf.co/models?search=awq>.
     /// Should replace GPTQ models wherever possible because of the better latency
     Awq,
+    /// Compressed tensors, which can be a mixture of different quantization methods.
+    CompressedTensors,
     /// 8 bit quantization, doesn't require specific model.
     /// Should be a drop-in replacement to bitsandbytes with much better performance.
     /// Kernels are from <https://github.com/NetEase-FuXi/EETQ.git>

@@ -274,6 +276,9 @@ impl std::fmt::Display for Quantization {
             Quantization::Awq => {
                 write!(f, "awq")
             }
+            Quantization::CompressedTensors => {
+                write!(f, "compressed-tensors")
+            }
             Quantization::Eetq => {
                 write!(f, "eetq")
             }
@@ -3,8 +3,10 @@
   buildPythonPackage,
   poetry-core,
   mypy-protobuf,
+  attention-kernels,
   awq-inference-engine,
   causal-conv1d,
+  compressed-tensors,
   eetq,
   einops,
   exllamav2,

@@ -26,15 +28,18 @@
   opentelemetry-exporter-otlp,
   opentelemetry-instrumentation-grpc,
   opentelemetry-semantic-conventions,
+  outlines,
   peft,
+  prometheus-client,
   punica-kernels,
+  py-cpuinfo,
+  pydantic,
   safetensors,
   tokenizers,
   torch,
   sentencepiece,
   transformers,
   typer,
-  vllm,
 }:
 
 let

@@ -71,9 +76,11 @@ buildPythonPackage {
   pythonRemoveDeps = [ "scipy" ];
 
   dependencies = [
+    attention-kernels
     awq-inference-engine
     eetq
     causal-conv1d
+    compressed-tensors
     einops
     exllamav2
     flashinfer

@@ -93,14 +100,17 @@ buildPythonPackage {
     opentelemetry-exporter-otlp
     opentelemetry-instrumentation-grpc
     opentelemetry-semantic-conventions
+    outlines
     peft
+    prometheus-client
     punica-kernels
+    py-cpuinfo
+    pydantic
     safetensors
     sentencepiece
     tokenizers
     transformers
     typer
-    vllm
   ];
 
   prePatch = ''
@@ -10,10 +10,12 @@ use crate::{
 };
 use async_stream::stream;
 use async_trait::async_trait;
+use axum::response::sse::Event;
 use chat_template::ChatTemplate;
 use futures::future::try_join_all;
 use futures::Stream;
 use minijinja::ErrorKind;
+use serde::Serialize;
 use std::sync::atomic::{AtomicBool, Ordering};
 use std::sync::Arc;
 use thiserror::Error;

@@ -373,4 +375,26 @@ impl InferError {
             InferError::StreamSerializationError(_) => "stream_serialization_error",
         }
     }
+
+    pub(crate) fn into_openai_event(self) -> Event {
+        Event::default()
+            .json_data(OpenaiErrorEvent {
+                error: APIError {
+                    message: self.to_string(),
+                    http_status_code: 422,
+                },
+            })
+            .unwrap()
+    }
+}
+
+#[derive(Serialize)]
+pub struct APIError {
+    message: String,
+    http_status_code: usize,
+}
+
+#[derive(Serialize)]
+pub struct OpenaiErrorEvent {
+    error: APIError,
+}
 }
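Serialized, the new event wraps the error in an OpenAI-style `error` object instead of the bare error payload used before. A hedged sketch of the resulting SSE data field, using only the field names from the structs above (the message text itself depends on the concrete `InferError` variant):

```python
# Illustrative only: JSON shape produced by serializing OpenaiErrorEvent / APIError above.
error_event = {
    "error": {
        "message": "Input validation error: ...",  # hypothetical InferError::to_string() output
        "http_status_code": 422,
    }
}
```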
@@ -22,11 +22,13 @@ use tracing::warn;
 use utoipa::ToSchema;
 use validation::Validation;
 
+#[allow(clippy::large_enum_variant)]
 #[derive(Clone)]
 pub enum Tokenizer {
     Python {
         tokenizer_name: String,
         revision: Option<String>,
+        trust_remote_code: bool,
     },
     Rust(tokenizers::Tokenizer),
 }

@@ -38,15 +40,20 @@ impl<'a> PyTokenizer<'a> {
         py: Python<'a>,
         tokenizer_name: String,
         revision: Option<String>,
+        trust_remote_code: bool,
     ) -> PyResult<PyTokenizer<'a>> {
         let transformers = py.import_bound("transformers")?;
         let auto = transformers.getattr("AutoTokenizer")?;
         let from_pretrained = auto.getattr("from_pretrained")?;
         let args = (tokenizer_name,);
         let kwargs = if let Some(rev) = &revision {
-            [("revision", rev.to_string())].into_py_dict_bound(py)
+            [
+                ("revision", rev.to_string().into_py(py)),
+                ("trust_remote_code", trust_remote_code.into_py(py)),
+            ]
+            .into_py_dict_bound(py)
         } else {
-            pyo3::types::PyDict::new_bound(py)
+            [("trust_remote_code", trust_remote_code.into_py(py))].into_py_dict_bound(py)
         };
         let tokenizer = from_pretrained.call(args, Some(&kwargs))?;
         tracing::info!("Loaded a python tokenizer");
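On the Python side, the pyo3 wrapper above boils down to a plain `AutoTokenizer.from_pretrained` call; a minimal sketch with placeholder values, only to show which keyword arguments the router now forwards:

```python
from transformers import AutoTokenizer

# Hypothetical values; in practice the router passes the served model's name and revision.
tokenizer = AutoTokenizer.from_pretrained(
    "some-org/some-model",      # tokenizer_name
    revision="main",            # forwarded when a revision is configured
    trust_remote_code=True,     # the newly forwarded flag from this change
)
print(tokenizer("What is deep learning?")["input_ids"])
```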
@@ -109,7 +109,7 @@ request_body = CompatGenerateRequest,
 responses(
 (status = 200, description = "Generated Text",
 content(
-("application/json" = GenerateResponse),
+("application/json" = Vec<GenerateResponse>),
 ("text/event-stream" = StreamResponse),
 )),
 (status = 424, description = "Generation Error", body = ErrorResponse,

@@ -866,7 +866,7 @@ pub(crate) async fn completions(
 
                     yield Ok(event);
                 }
-                Err(err) => yield Ok(Event::from(err)),
+                Err(err) => yield Ok(err.into_openai_event()),
             }
         }
     };

@@ -1274,7 +1274,8 @@ pub(crate) async fn chat_completions(
         };
         let mut response_as_tool = using_tools;
         while let Some(result) = response_stream.next().await {
-            if let Ok(stream_token) = result {
+            match result{
+                Ok(stream_token) => {
                 let token_text = &stream_token.token.text.clone();
                 match state {
                     StreamState::Buffering => {

@@ -1368,6 +1369,8 @@ pub(crate) async fn chat_completions(
                     }
                 }
             }
+                Err(err) => yield Ok(err.into_openai_event())
+            }
         }
         yield Ok::<Event, Infallible>(Event::default().data("[DONE]"));
     };

@@ -1829,6 +1832,7 @@ pub async fn run(
         Tokenizer::Python {
             tokenizer_name: tokenizer_name.clone(),
             revision: revision.clone(),
+            trust_remote_code,
         }
     }
 };
@@ -439,9 +439,11 @@ fn tokenizer_worker(
         Tokenizer::Python {
             tokenizer_name,
             revision,
+            trust_remote_code,
         } => {
             pyo3::Python::with_gil(|py| -> pyo3::PyResult<()> {
-                let tokenizer = PyTokenizer::from_py(py, tokenizer_name, revision)?;
+                let tokenizer =
+                    PyTokenizer::from_py(py, tokenizer_name, revision, trust_remote_code)?;
                 // Loop over requests
                 while let Some(((inputs, add_special_tokens, truncate), response_tx, parent_span)) =
                     receiver.blocking_recv()
@@ -23,14 +23,14 @@ gen-server:
 install-server: gen-server
 	pip install pip --upgrade
 	pip install -r requirements_cuda.txt
-	pip install -e ".[accelerate, quantize, peft, outlines]"
+	pip install -e ".[accelerate, compressed-tensors, quantize, peft, outlines]"
 
 
 install: install-cuda
 	echo "Installed server"
 
-install-cuda: install-server install-flash-attention-v2-cuda install-vllm-cuda install-flash-attention
-	pip install -e ".[bnb,marlin,moe]"
+install-cuda: install-server install-flash-attention-v2-cuda install-flash-attention
+	pip install -e ".[attention,bnb,marlin,moe]"
 	pip install nvidia-nccl-cu12==2.22.3
 
 install-rocm: install-server install-flash-attention-v2-rocm install-vllm-rocm
@@ -1,14 +1,4 @@
-commit_cuda := d243e9dc7e2c9c2e36a4150ec8e64809cb55c01b
commit_rocm := 4e0929e6e4fa0a3d09d358715c288020ea9dc247
-build-vllm-cuda:
-if [ ! -d 'vllm' ]; then \
-pip install -U ninja packaging --no-cache-dir && \
-git clone https://github.com/Narsil/vllm.git vllm; \
-fi
-cd vllm && git fetch origin && git checkout $(commit_cuda) && python setup.py build
-
-install-vllm-cuda: build-vllm-cuda
-cd vllm && git fetch origin && git checkout $(commit_cuda) && pip install -e .
-
build-vllm-rocm:
if [ ! -d 'vllm' ]; then \

@@ -1,4 +1,4 @@
-# This file is automatically @generated by Poetry 1.8.3 and should not be changed by hand.
+# This file is automatically @generated by Poetry 1.8.4 and should not be changed by hand.

[[package]]
name = "accelerate"

@ -167,6 +167,17 @@ files = [
|
||||||
[package.dependencies]
|
[package.dependencies]
|
||||||
frozenlist = ">=1.1.0"
|
frozenlist = ">=1.1.0"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "airportsdata"
|
||||||
|
version = "20241001"
|
||||||
|
description = "Extensive database of location and timezone data for nearly every airport and landing strip in the world."
|
||||||
|
optional = true
|
||||||
|
python-versions = ">=3.9"
|
||||||
|
files = [
|
||||||
|
{file = "airportsdata-20241001-py3-none-any.whl", hash = "sha256:67d71cf2c5378cc17ff66b62b1e11aa2444043949c894543ac8fd8dafce192fd"},
|
||||||
|
{file = "airportsdata-20241001.tar.gz", hash = "sha256:fa0bd143b4f4be3557cb892fa0612ef210fd91a92bd720b4d8221de576a4fa00"},
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "annotated-types"
|
name = "annotated-types"
|
||||||
version = "0.7.0"
|
version = "0.7.0"
|
||||||
|
@ -189,6 +200,74 @@ files = [
|
||||||
{file = "async_timeout-4.0.3-py3-none-any.whl", hash = "sha256:7405140ff1230c310e51dc27b3145b9092d659ce68ff733fb0cefe3ee42be028"},
|
{file = "async_timeout-4.0.3-py3-none-any.whl", hash = "sha256:7405140ff1230c310e51dc27b3145b9092d659ce68ff733fb0cefe3ee42be028"},
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "attention-kernels"
|
||||||
|
version = "0.1.1"
|
||||||
|
description = "Attention kernels"
|
||||||
|
optional = true
|
||||||
|
python-versions = ">=3.8"
|
||||||
|
files = [
|
||||||
|
{file = "attention_kernels-0.1.1+cu123torch2.4-cp310-cp310-linux_x86_64.whl", hash = "sha256:812851d4ce0f54ca764ff3815a731b15f0cb110115d0aa2d0997cd7794d808bb"},
|
||||||
|
]
|
||||||
|
|
||||||
|
[package.dependencies]
|
||||||
|
torch = "*"
|
||||||
|
|
||||||
|
[package.source]
|
||||||
|
type = "url"
|
||||||
|
url = "https://github.com/danieldk/attention-kernels/releases/download/v0.1.1/attention_kernels-0.1.1+cu123torch2.4-cp310-cp310-linux_x86_64.whl"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "attention-kernels"
|
||||||
|
version = "0.1.1"
|
||||||
|
description = "Attention kernels"
|
||||||
|
optional = true
|
||||||
|
python-versions = ">=3.8"
|
||||||
|
files = [
|
||||||
|
{file = "attention_kernels-0.1.1+cu123torch2.4-cp311-cp311-linux_x86_64.whl", hash = "sha256:614c402621b11dd1f5741a016b9fd27cb6a68814471f2048bc05206923516268"},
|
||||||
|
]
|
||||||
|
|
||||||
|
[package.dependencies]
|
||||||
|
torch = "*"
|
||||||
|
|
||||||
|
[package.source]
|
||||||
|
type = "url"
|
||||||
|
url = "https://github.com/danieldk/attention-kernels/releases/download/v0.1.1/attention_kernels-0.1.1+cu123torch2.4-cp311-cp311-linux_x86_64.whl"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "attention-kernels"
|
||||||
|
version = "0.1.1"
|
||||||
|
description = "Attention kernels"
|
||||||
|
optional = true
|
||||||
|
python-versions = ">=3.8"
|
||||||
|
files = [
|
||||||
|
{file = "attention_kernels-0.1.1+cu123torch2.4-cp312-cp312-linux_x86_64.whl", hash = "sha256:6b2ca7c98997431d5f6c4af7553dce6b1bff8dfdec374c97c6ffba71325a02b7"},
|
||||||
|
]
|
||||||
|
|
||||||
|
[package.dependencies]
|
||||||
|
torch = "*"
|
||||||
|
|
||||||
|
[package.source]
|
||||||
|
type = "url"
|
||||||
|
url = "https://github.com/danieldk/attention-kernels/releases/download/v0.1.1/attention_kernels-0.1.1+cu123torch2.4-cp312-cp312-linux_x86_64.whl"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "attention-kernels"
|
||||||
|
version = "0.1.1"
|
||||||
|
description = "Attention kernels"
|
||||||
|
optional = true
|
||||||
|
python-versions = ">=3.8"
|
||||||
|
files = [
|
||||||
|
{file = "attention_kernels-0.1.1+cu123torch2.4-cp39-cp39-linux_x86_64.whl", hash = "sha256:a56710c5626e461d6f628ae14b74ffc89833578ebd59c3c0c47f5d6f07461fbf"},
|
||||||
|
]
|
||||||
|
|
||||||
|
[package.dependencies]
|
||||||
|
torch = "*"
|
||||||
|
|
||||||
|
[package.source]
|
||||||
|
type = "url"
|
||||||
|
url = "https://github.com/danieldk/attention-kernels/releases/download/v0.1.1/attention_kernels-0.1.1+cu123torch2.4-cp39-cp39-linux_x86_64.whl"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "attrs"
|
name = "attrs"
|
||||||
version = "24.2.0"
|
version = "24.2.0"
|
||||||
|
@ -388,6 +467,26 @@ files = [
|
||||||
{file = "colorama-0.4.6.tar.gz", hash = "sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44"},
|
{file = "colorama-0.4.6.tar.gz", hash = "sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44"},
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "compressed-tensors"
|
||||||
|
version = "0.7.1"
|
||||||
|
description = "Library for utilization of compressed safetensors of neural network models"
|
||||||
|
optional = true
|
||||||
|
python-versions = "*"
|
||||||
|
files = [
|
||||||
|
{file = "compressed-tensors-0.7.1.tar.gz", hash = "sha256:3c7865ebfe4ea76ae94d7c674bcf93aedd2064571f682c09a377a219d5ebb3a0"},
|
||||||
|
{file = "compressed_tensors-0.7.1-py3-none-any.whl", hash = "sha256:22d11558a70f655ae647db9c8e9fb14a5e9d6983ca5aec3f267518625fd6dd0e"},
|
||||||
|
]
|
||||||
|
|
||||||
|
[package.dependencies]
|
||||||
|
pydantic = ">=2.0"
|
||||||
|
torch = ">=1.7.0"
|
||||||
|
transformers = "*"
|
||||||
|
|
||||||
|
[package.extras]
|
||||||
|
accelerate = ["accelerate"]
|
||||||
|
dev = ["black (==22.12.0)", "flake8 (>=3.8.3)", "isort (==5.8.0)", "nbconvert (>=7.16.3)", "pytest (>=6.0.0)", "wheel (>=0.36.2)"]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "datasets"
|
name = "datasets"
|
||||||
version = "2.21.0"
|
version = "2.21.0"
|
||||||
|
@ -1023,17 +1122,6 @@ MarkupSafe = ">=2.0"
|
||||||
[package.extras]
|
[package.extras]
|
||||||
i18n = ["Babel (>=2.7)"]
|
i18n = ["Babel (>=2.7)"]
|
||||||
|
|
||||||
[[package]]
|
|
||||||
name = "joblib"
|
|
||||||
version = "1.4.2"
|
|
||||||
description = "Lightweight pipelining with Python functions"
|
|
||||||
optional = true
|
|
||||||
python-versions = ">=3.8"
|
|
||||||
files = [
|
|
||||||
{file = "joblib-1.4.2-py3-none-any.whl", hash = "sha256:06d478d5674cbc267e7496a410ee875abd68e4340feff4490bcb7afb88060ae6"},
|
|
||||||
{file = "joblib-1.4.2.tar.gz", hash = "sha256:2382c5816b2636fbd20a09e0f4e9dad4736765fdfb7dca582943b9c1366b3f0e"},
|
|
||||||
]
|
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "jsonschema"
|
name = "jsonschema"
|
||||||
version = "4.23.0"
|
version = "4.23.0"
|
||||||
|
@ -1086,36 +1174,6 @@ interegular = ["interegular (>=0.3.1,<0.4.0)"]
|
||||||
nearley = ["js2py"]
|
nearley = ["js2py"]
|
||||||
regex = ["regex"]
|
regex = ["regex"]
|
||||||
|
|
||||||
[[package]]
|
|
||||||
name = "llvmlite"
|
|
||||||
version = "0.43.0"
|
|
||||||
description = "lightweight wrapper around basic LLVM functionality"
|
|
||||||
optional = true
|
|
||||||
python-versions = ">=3.9"
|
|
||||||
files = [
|
|
||||||
{file = "llvmlite-0.43.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:a289af9a1687c6cf463478f0fa8e8aa3b6fb813317b0d70bf1ed0759eab6f761"},
|
|
||||||
{file = "llvmlite-0.43.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:6d4fd101f571a31acb1559ae1af30f30b1dc4b3186669f92ad780e17c81e91bc"},
|
|
||||||
{file = "llvmlite-0.43.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7d434ec7e2ce3cc8f452d1cd9a28591745de022f931d67be688a737320dfcead"},
|
|
||||||
{file = "llvmlite-0.43.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6912a87782acdff6eb8bf01675ed01d60ca1f2551f8176a300a886f09e836a6a"},
|
|
||||||
{file = "llvmlite-0.43.0-cp310-cp310-win_amd64.whl", hash = "sha256:14f0e4bf2fd2d9a75a3534111e8ebeb08eda2f33e9bdd6dfa13282afacdde0ed"},
|
|
||||||
{file = "llvmlite-0.43.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:3e8d0618cb9bfe40ac38a9633f2493d4d4e9fcc2f438d39a4e854f39cc0f5f98"},
|
|
||||||
{file = "llvmlite-0.43.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:e0a9a1a39d4bf3517f2af9d23d479b4175ead205c592ceeb8b89af48a327ea57"},
|
|
||||||
{file = "llvmlite-0.43.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c1da416ab53e4f7f3bc8d4eeba36d801cc1894b9fbfbf2022b29b6bad34a7df2"},
|
|
||||||
{file = "llvmlite-0.43.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:977525a1e5f4059316b183fb4fd34fa858c9eade31f165427a3977c95e3ee749"},
|
|
||||||
{file = "llvmlite-0.43.0-cp311-cp311-win_amd64.whl", hash = "sha256:d5bd550001d26450bd90777736c69d68c487d17bf371438f975229b2b8241a91"},
|
|
||||||
{file = "llvmlite-0.43.0-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:f99b600aa7f65235a5a05d0b9a9f31150c390f31261f2a0ba678e26823ec38f7"},
|
|
||||||
{file = "llvmlite-0.43.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:35d80d61d0cda2d767f72de99450766250560399edc309da16937b93d3b676e7"},
|
|
||||||
{file = "llvmlite-0.43.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:eccce86bba940bae0d8d48ed925f21dbb813519169246e2ab292b5092aba121f"},
|
|
||||||
{file = "llvmlite-0.43.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:df6509e1507ca0760787a199d19439cc887bfd82226f5af746d6977bd9f66844"},
|
|
||||||
{file = "llvmlite-0.43.0-cp312-cp312-win_amd64.whl", hash = "sha256:7a2872ee80dcf6b5dbdc838763d26554c2a18aa833d31a2635bff16aafefb9c9"},
|
|
||||||
{file = "llvmlite-0.43.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:9cd2a7376f7b3367019b664c21f0c61766219faa3b03731113ead75107f3b66c"},
|
|
||||||
{file = "llvmlite-0.43.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:18e9953c748b105668487b7c81a3e97b046d8abf95c4ddc0cd3c94f4e4651ae8"},
|
|
||||||
{file = "llvmlite-0.43.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:74937acd22dc11b33946b67dca7680e6d103d6e90eeaaaf932603bec6fe7b03a"},
|
|
||||||
{file = "llvmlite-0.43.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bc9efc739cc6ed760f795806f67889923f7274276f0eb45092a1473e40d9b867"},
|
|
||||||
{file = "llvmlite-0.43.0-cp39-cp39-win_amd64.whl", hash = "sha256:47e147cdda9037f94b399bf03bfd8a6b6b1f2f90be94a454e3386f006455a9b4"},
|
|
||||||
{file = "llvmlite-0.43.0.tar.gz", hash = "sha256:ae2b5b5c3ef67354824fb75517c8db5fbe93bc02cd9671f3c62271626bc041d5"},
|
|
||||||
]
|
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "loguru"
|
name = "loguru"
|
||||||
version = "0.6.0"
|
version = "0.6.0"
|
||||||
|
@ -1557,40 +1615,6 @@ doc = ["nb2plots (>=0.7)", "nbconvert (<7.9)", "numpydoc (>=1.6)", "pillow (>=9.
|
||||||
extra = ["lxml (>=4.6)", "pydot (>=1.4.2)", "pygraphviz (>=1.11)", "sympy (>=1.10)"]
|
extra = ["lxml (>=4.6)", "pydot (>=1.4.2)", "pygraphviz (>=1.11)", "sympy (>=1.10)"]
|
||||||
test = ["pytest (>=7.2)", "pytest-cov (>=4.0)"]
|
test = ["pytest (>=7.2)", "pytest-cov (>=4.0)"]
|
||||||
|
|
||||||
[[package]]
|
|
||||||
name = "numba"
|
|
||||||
version = "0.60.0"
|
|
||||||
description = "compiling Python code using LLVM"
|
|
||||||
optional = true
|
|
||||||
python-versions = ">=3.9"
|
|
||||||
files = [
|
|
||||||
{file = "numba-0.60.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:5d761de835cd38fb400d2c26bb103a2726f548dc30368853121d66201672e651"},
|
|
||||||
{file = "numba-0.60.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:159e618ef213fba758837f9837fb402bbe65326e60ba0633dbe6c7f274d42c1b"},
|
|
||||||
{file = "numba-0.60.0-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:1527dc578b95c7c4ff248792ec33d097ba6bef9eda466c948b68dfc995c25781"},
|
|
||||||
{file = "numba-0.60.0-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:fe0b28abb8d70f8160798f4de9d486143200f34458d34c4a214114e445d7124e"},
|
|
||||||
{file = "numba-0.60.0-cp310-cp310-win_amd64.whl", hash = "sha256:19407ced081d7e2e4b8d8c36aa57b7452e0283871c296e12d798852bc7d7f198"},
|
|
||||||
{file = "numba-0.60.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:a17b70fc9e380ee29c42717e8cc0bfaa5556c416d94f9aa96ba13acb41bdece8"},
|
|
||||||
{file = "numba-0.60.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:3fb02b344a2a80efa6f677aa5c40cd5dd452e1b35f8d1c2af0dfd9ada9978e4b"},
|
|
||||||
{file = "numba-0.60.0-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:5f4fde652ea604ea3c86508a3fb31556a6157b2c76c8b51b1d45eb40c8598703"},
|
|
||||||
{file = "numba-0.60.0-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:4142d7ac0210cc86432b818338a2bc368dc773a2f5cf1e32ff7c5b378bd63ee8"},
|
|
||||||
{file = "numba-0.60.0-cp311-cp311-win_amd64.whl", hash = "sha256:cac02c041e9b5bc8cf8f2034ff6f0dbafccd1ae9590dc146b3a02a45e53af4e2"},
|
|
||||||
{file = "numba-0.60.0-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:d7da4098db31182fc5ffe4bc42c6f24cd7d1cb8a14b59fd755bfee32e34b8404"},
|
|
||||||
{file = "numba-0.60.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:38d6ea4c1f56417076ecf8fc327c831ae793282e0ff51080c5094cb726507b1c"},
|
|
||||||
{file = "numba-0.60.0-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:62908d29fb6a3229c242e981ca27e32a6e606cc253fc9e8faeb0e48760de241e"},
|
|
||||||
{file = "numba-0.60.0-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:0ebaa91538e996f708f1ab30ef4d3ddc344b64b5227b67a57aa74f401bb68b9d"},
|
|
||||||
{file = "numba-0.60.0-cp312-cp312-win_amd64.whl", hash = "sha256:f75262e8fe7fa96db1dca93d53a194a38c46da28b112b8a4aca168f0df860347"},
|
|
||||||
{file = "numba-0.60.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:01ef4cd7d83abe087d644eaa3d95831b777aa21d441a23703d649e06b8e06b74"},
|
|
||||||
{file = "numba-0.60.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:819a3dfd4630d95fd574036f99e47212a1af41cbcb019bf8afac63ff56834449"},
|
|
||||||
{file = "numba-0.60.0-cp39-cp39-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:0b983bd6ad82fe868493012487f34eae8bf7dd94654951404114f23c3466d34b"},
|
|
||||||
{file = "numba-0.60.0-cp39-cp39-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:c151748cd269ddeab66334bd754817ffc0cabd9433acb0f551697e5151917d25"},
|
|
||||||
{file = "numba-0.60.0-cp39-cp39-win_amd64.whl", hash = "sha256:3031547a015710140e8c87226b4cfe927cac199835e5bf7d4fe5cb64e814e3ab"},
|
|
||||||
{file = "numba-0.60.0.tar.gz", hash = "sha256:5df6158e5584eece5fc83294b949fd30b9f1125df7708862205217e068aabf16"},
|
|
||||||
]
|
|
||||||
|
|
||||||
[package.dependencies]
|
|
||||||
llvmlite = "==0.43.*"
|
|
||||||
numpy = ">=1.22,<2.1"
|
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "numpy"
|
name = "numpy"
|
||||||
version = "1.26.4"
|
version = "1.26.4"
|
||||||
|
@ -1968,36 +1992,83 @@ opentelemetry-api = "1.25.0"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "outlines"
|
name = "outlines"
|
||||||
version = "0.0.34"
|
version = "0.1.3"
|
||||||
description = "Probabilistic Generative Model Programming"
|
description = "Probabilistic Generative Model Programming"
|
||||||
optional = true
|
optional = true
|
||||||
python-versions = ">=3.8"
|
python-versions = ">=3.9"
|
||||||
files = [
|
files = [
|
||||||
{file = "outlines-0.0.34-py3-none-any.whl", hash = "sha256:911588a7e64a4f193b97fb4c501d98ccfd4e95a98f6a3ada67a280bf0c373c50"},
|
{file = "outlines-0.1.3-py3-none-any.whl", hash = "sha256:afcf6012b7cabbaae4a58975d03190c0bbc3d402b0b2a37538e05f335d73a247"},
|
||||||
{file = "outlines-0.0.34.tar.gz", hash = "sha256:594e7204c770b47a62eb5c2ba7d25ea0ab2e16882b5f04556712a0228d3d3309"},
|
{file = "outlines-0.1.3.tar.gz", hash = "sha256:5a48ad00d3bdd8eccaa7574821eb5aaa27ab9f61fde9c3fba52f352dc00197e4"},
|
||||||
]
|
]
|
||||||
|
|
||||||
[package.dependencies]
|
[package.dependencies]
|
||||||
|
airportsdata = "*"
|
||||||
cloudpickle = "*"
|
cloudpickle = "*"
|
||||||
|
datasets = "*"
|
||||||
diskcache = "*"
|
diskcache = "*"
|
||||||
interegular = "*"
|
interegular = "*"
|
||||||
jinja2 = "*"
|
jinja2 = "*"
|
||||||
joblib = "*"
|
|
||||||
jsonschema = "*"
|
jsonschema = "*"
|
||||||
lark = "*"
|
lark = "*"
|
||||||
nest-asyncio = "*"
|
nest-asyncio = "*"
|
||||||
numba = "*"
|
numpy = "<2.0.0"
|
||||||
numpy = "*"
|
outlines-core = "0.1.14"
|
||||||
|
pycountry = "*"
|
||||||
pydantic = ">=2.0"
|
pydantic = ">=2.0"
|
||||||
referencing = "*"
|
referencing = "*"
|
||||||
requests = "*"
|
requests = "*"
|
||||||
scipy = "*"
|
torch = "*"
|
||||||
torch = ">=2.1.0"
|
tqdm = "*"
|
||||||
transformers = "*"
|
typing-extensions = "*"
|
||||||
|
|
||||||
[package.extras]
|
[package.extras]
|
||||||
serve = ["fastapi", "pydantic (>=2.0)", "ray (==2.9.0)", "uvicorn", "vllm (>=0.3.0)"]
|
serve = ["fastapi", "pydantic (>=2.0)", "uvicorn", "vllm (>=0.3.0)"]
|
||||||
test = ["accelerate", "beartype (<0.16.0)", "coverage[toml] (>=5.1)", "datasets", "diff-cover", "huggingface-hub", "llama-cpp-python (>=0.2.42)", "pre-commit", "pytest", "pytest-benchmark", "pytest-cov", "pytest-mock", "responses", "transformers"]
|
test = ["accelerate", "beartype (<0.16.0)", "coverage[toml] (>=5.1)", "diff-cover", "exllamav2", "huggingface-hub", "llama-cpp-python", "mlx-lm", "openai (>=1.0.0)", "pillow", "pre-commit", "pytest", "pytest-benchmark", "pytest-cov", "pytest-mock", "responses", "transformers", "vllm"]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "outlines-core"
|
||||||
|
version = "0.1.14"
|
||||||
|
description = "Structured Text Generation in Rust"
|
||||||
|
optional = true
|
||||||
|
python-versions = ">=3.8"
|
||||||
|
files = [
|
||||||
|
{file = "outlines_core-0.1.14-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:291c6d9d348cb5562cd28ce44d80822d77238f1cd7c30d890b5b20488e71608d"},
|
||||||
|
{file = "outlines_core-0.1.14-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:3a50e2f6745e0c34cc857d1bd5590e2966ad06e8ce10802976e9e6c116c7533d"},
|
||||||
|
{file = "outlines_core-0.1.14-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f7dfe64b590a6a88dcc5e59f0a399fff0458cdcf97d68de07f08e1bd3bf8ac1d"},
|
||||||
|
{file = "outlines_core-0.1.14-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:100de068ce52893bec316481e65db8f1c734a0f25f540c29dafd7a8afec0a29d"},
|
||||||
|
{file = "outlines_core-0.1.14-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:e06cb724770fd0fe1c8444382c4a6e79901bba33720f70fe6c8437f58eceb92e"},
|
||||||
|
{file = "outlines_core-0.1.14-cp310-cp310-win32.whl", hash = "sha256:6d41da3d8a087fd54133cf910c2d5759da55490bbd0e3bc6c1e7907b54248415"},
|
||||||
|
{file = "outlines_core-0.1.14-cp310-cp310-win_amd64.whl", hash = "sha256:646fd1073feed393bc77f9605a2fa27a54551ab04f85867ce789af1dee6326fa"},
|
||||||
|
{file = "outlines_core-0.1.14-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:60f3a947fe09106f7668cf832c28b9269b8f0fc109f081608acfce9262213359"},
|
||||||
|
{file = "outlines_core-0.1.14-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:5e273a100c922f794d8e077a8161d0985d3005887066b4af3ae7afd3742fe9b8"},
|
||||||
|
{file = "outlines_core-0.1.14-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:622e547f11a869fc67be40abc4cbcda89ae6f46f9eb46a1ec0666bd6807e0c67"},
|
||||||
|
{file = "outlines_core-0.1.14-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:60c9933a9faaa51b39aea3518f1822b0d3ec2c9a13b16849caca3955e29e320d"},
|
||||||
|
{file = "outlines_core-0.1.14-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:4a8c616ce103ef9102dbf4326f67b03e1e0f46aa19351e57f4beb37588c00428"},
|
||||||
|
{file = "outlines_core-0.1.14-cp311-cp311-win32.whl", hash = "sha256:1c77aaa4556cbb6e93cc42be0a6e262f175e0754b7694d702d642ff03df67f2c"},
|
||||||
|
{file = "outlines_core-0.1.14-cp311-cp311-win_amd64.whl", hash = "sha256:eb6ffe410866f65dbe17e95b0aabd70d990f058a2dc4e8b74f9583b07248cd36"},
|
||||||
|
{file = "outlines_core-0.1.14-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:b0e408b033618f23e9bb928a47b33b1bd4c9d04a3dbec680a20977de3b4f590d"},
|
||||||
|
{file = "outlines_core-0.1.14-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:21d1393a6da5d3320e8c8247e9deeb851c5c862fd6ea5c779bd29797e8987155"},
|
||||||
|
{file = "outlines_core-0.1.14-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5829c568db76673d36caaf0f86e96748b491b4a209deb9be87617372394a5fb9"},
|
||||||
|
{file = "outlines_core-0.1.14-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7e855ec99bce1099c0755bcbfa44568adf7ae0083905ba04f58a17614ddf0fe7"},
|
||||||
|
{file = "outlines_core-0.1.14-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:b897cfbf9c2719aa011d9b439b4c6751d9c7df5683b2169617972d4b4a914403"},
|
||||||
|
{file = "outlines_core-0.1.14-cp38-cp38-win32.whl", hash = "sha256:4c9d908004b31bcd432156d60f4895bf5e1b51ca8c8eed82b12f1bb57d5bf7fd"},
|
||||||
|
{file = "outlines_core-0.1.14-cp38-cp38-win_amd64.whl", hash = "sha256:6668a930d928216d0b319ad84947903f1e27556f604a9743051f795b11008b64"},
|
||||||
|
{file = "outlines_core-0.1.14-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:b745aa469cf3fb347b79a257804d75d1324e01691158664c1e413a816ce6b98d"},
|
||||||
|
{file = "outlines_core-0.1.14-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:27504c8360467429d6223ebc49180d6956d7418bfc3d324f6ad10f069e1813ad"},
|
||||||
|
{file = "outlines_core-0.1.14-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fd8f1e1d91a206a520d1c577ce00136de2beb1d200ef93759fd4c9f45abe24d3"},
|
||||||
|
{file = "outlines_core-0.1.14-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f30c8acb42895b624c504b85678331c5f9376fa4b8069ce06a27cf80f5881e27"},
|
||||||
|
{file = "outlines_core-0.1.14-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:0e6cd0e7d995a7b04d90139a695279ab4a9eb7f492618b2c037a85eaf5f9fc59"},
|
||||||
|
{file = "outlines_core-0.1.14-cp39-cp39-win32.whl", hash = "sha256:3104af4084da0e7c3d4b8538b43c725581d66bb68d426bc389680f06c3667476"},
|
||||||
|
{file = "outlines_core-0.1.14-cp39-cp39-win_amd64.whl", hash = "sha256:45c6b9baded0337c4dcfa156af05ec4efd2b25c4d976e77be28146e4037b991f"},
|
||||||
|
{file = "outlines_core-0.1.14.tar.gz", hash = "sha256:6db033e4f8e48381164e36cc716746640ad5022f0d86e4c88af15c75886b93a4"},
|
||||||
|
]
|
||||||
|
|
||||||
|
[package.dependencies]
|
||||||
|
interegular = "*"
|
||||||
|
jsonschema = "*"
|
||||||
|
|
||||||
|
[package.extras]
|
||||||
|
test = ["accelerate", "asv", "beartype (<0.16.0)", "coverage[toml] (>=5.1)", "datasets", "diff-cover", "huggingface-hub", "pillow", "pre-commit", "pydantic", "pytest", "pytest-benchmark", "pytest-cov", "pytest-mock", "setuptools-rust", "torch", "transformers"]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "packaging"
|
name = "packaging"
|
||||||
|
@ -2470,6 +2541,17 @@ numpy = ">=1.16.6"
|
||||||
[package.extras]
|
[package.extras]
|
||||||
test = ["cffi", "hypothesis", "pandas", "pytest", "pytz"]
|
test = ["cffi", "hypothesis", "pandas", "pytest", "pytz"]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "pycountry"
|
||||||
|
version = "24.6.1"
|
||||||
|
description = "ISO country, subdivision, language, currency and script definitions and their translations"
|
||||||
|
optional = true
|
||||||
|
python-versions = ">=3.8"
|
||||||
|
files = [
|
||||||
|
{file = "pycountry-24.6.1-py3-none-any.whl", hash = "sha256:f1a4fb391cd7214f8eefd39556d740adcc233c778a27f8942c8dca351d6ce06f"},
|
||||||
|
{file = "pycountry-24.6.1.tar.gz", hash = "sha256:b61b3faccea67f87d10c1f2b0fc0be714409e8fcdcc1315613174f6466c10221"},
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "pydantic"
|
name = "pydantic"
|
||||||
version = "2.9.2"
|
version = "2.9.2"
|
||||||
|
@@ -3971,7 +4053,9 @@ type = ["pytest-mypy"]

[extras]
accelerate = ["accelerate"]
+attention = ["attention-kernels", "attention-kernels", "attention-kernels", "attention-kernels"]
bnb = ["bitsandbytes"]
+compressed-tensors = ["compressed-tensors"]
marlin = ["marlin-kernels", "marlin-kernels", "marlin-kernels", "marlin-kernels"]
moe = ["moe-kernels", "moe-kernels", "moe-kernels", "moe-kernels"]
outlines = ["outlines"]

@@ -3982,4 +4066,4 @@ torch = ["torch"]
[metadata]
lock-version = "2.0"
python-versions = ">=3.9,<3.13"
-content-hash = "b39033e573f50a0f046787aebf1702d86673aad0b2fcee818404fcea7f644b81"
+content-hash = "05add88628d836faceae1a26fde4092651a6eca74555ae38ebff879a7895be7e"

@@ -9,7 +9,7 @@ text-generation-server = 'text_generation_server.cli:app'

[tool.poetry.dependencies]
python = ">=3.9,<3.13"
-protobuf = "^4.25.3"
+protobuf = ">=4.25.3,<6"
grpcio = "^1.51.1"
grpcio-status = "^1.51.1"
grpcio-reflection = "^1.51.1"

@@ -34,12 +34,19 @@ peft = { version = "^0.10", optional = true }
torch = { version = "^2.4.0", optional = true }
scipy = "^1.11.1"
pillow = "^10.0.0"
-outlines= { version = "^0.0.34", optional = true }
-prometheus-client = "^0.20.0"
+outlines= { version = "^0.1.1", optional = true }
+prometheus-client = ">=0.20.0,<0.22"
py-cpuinfo = "^9.0.0"
+compressed-tensors = { version = "^0.7.1", optional = true }
# Remove later, temporary workaround for outlines.
numpy = "^1.26"

+attention-kernels = [
+{ url = "https://github.com/danieldk/attention-kernels/releases/download/v0.1.1/attention_kernels-0.1.1+cu123torch2.4-cp39-cp39-linux_x86_64.whl", python = "~3.9", optional = true },
+{ url = "https://github.com/danieldk/attention-kernels/releases/download/v0.1.1/attention_kernels-0.1.1+cu123torch2.4-cp310-cp310-linux_x86_64.whl", python = "~3.10", optional = true },
+{ url = "https://github.com/danieldk/attention-kernels/releases/download/v0.1.1/attention_kernels-0.1.1+cu123torch2.4-cp311-cp311-linux_x86_64.whl", python = "~3.11", optional = true },
+{ url = "https://github.com/danieldk/attention-kernels/releases/download/v0.1.1/attention_kernels-0.1.1+cu123torch2.4-cp312-cp312-linux_x86_64.whl", python = "~3.12", optional = true },
+]
marlin-kernels = [
{ url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.1/marlin_kernels-0.3.1+cu123torch2.4-cp39-cp39-linux_x86_64.whl", python = "~3.9", optional = true },
{ url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.1/marlin_kernels-0.3.1+cu123torch2.4-cp310-cp310-linux_x86_64.whl", python = "~3.10", optional = true },

@@ -57,7 +64,9 @@ rich = "^13.7.1"
[tool.poetry.extras]
torch = ["torch"]
accelerate = ["accelerate"]
+attention = ["attention-kernels"]
bnb = ["bitsandbytes"]
+compressed-tensors = ["compressed-tensors"]
marlin = ["marlin-kernels"]
moe = ["moe-kernels"]
peft = ["peft"]

@@ -45,7 +45,7 @@ sentencepiece==0.2.0 ; python_version >= "3.9" and python_version < "3.13"
setuptools==75.2.0 ; python_version >= "3.9" and python_version < "3.13"
tokenizers==0.20.1 ; python_version >= "3.9" and python_version < "3.13"
tqdm==4.66.5 ; python_version >= "3.9" and python_version < "3.13"
-transformers==4.45.2 ; python_version >= "3.9" and python_version < "3.13"
+transformers==4.46.0 ; python_version >= "3.9" and python_version < "3.13"
typer==0.6.1 ; python_version >= "3.9" and python_version < "3.13"
typing-extensions==4.12.2 ; python_version >= "3.9" and python_version < "3.13"
urllib3==2.2.3 ; python_version >= "3.9" and python_version < "3.13"

@@ -45,7 +45,7 @@ sentencepiece==0.2.0 ; python_version >= "3.9" and python_version < "3.13"
setuptools==75.2.0 ; python_version >= "3.9" and python_version < "3.13"
tokenizers==0.20.1 ; python_version >= "3.9" and python_version < "3.13"
tqdm==4.66.5 ; python_version >= "3.9" and python_version < "3.13"
-transformers==4.45.2 ; python_version >= "3.9" and python_version < "3.13"
+transformers==4.46.0 ; python_version >= "3.9" and python_version < "3.13"
typer==0.6.1 ; python_version >= "3.9" and python_version < "3.13"
typing-extensions==4.12.2 ; python_version >= "3.9" and python_version < "3.13"
urllib3==2.2.3 ; python_version >= "3.9" and python_version < "3.13"

@@ -45,7 +45,7 @@ sentencepiece==0.2.0 ; python_version >= "3.9" and python_version < "3.13"
setuptools==75.2.0 ; python_version >= "3.9" and python_version < "3.13"
tokenizers==0.20.1 ; python_version >= "3.9" and python_version < "3.13"
tqdm==4.66.5 ; python_version >= "3.9" and python_version < "3.13"
-transformers==4.45.2 ; python_version >= "3.9" and python_version < "3.13"
+transformers==4.46.0 ; python_version >= "3.9" and python_version < "3.13"
typer==0.6.1 ; python_version >= "3.9" and python_version < "3.13"
typing-extensions==4.12.2 ; python_version >= "3.9" and python_version < "3.13"
urllib3==2.2.3 ; python_version >= "3.9" and python_version < "3.13"

@@ -19,6 +19,7 @@ class Quantization(str, Enum):
bitsandbytes_fp4 = "bitsandbytes-fp4"
gptq = "gptq"
awq = "awq"
+compressed_tensors = "compressed-tensors"
eetq = "eetq"
exl2 = "exl2"
fp8 = "fp8"

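For reference, the new member is a str-valued enum variant, so the string "compressed-tensors" passed on the command line resolves directly to it. A minimal standalone sketch (not the server's actual CLI wiring; only the member relevant to this change is shown):

    from enum import Enum

    class Quantization(str, Enum):
        # The real enum carries many more quantization backends.
        compressed_tensors = "compressed-tensors"

    # A CLI string maps straight onto the enum member and back.
    assert Quantization("compressed-tensors") is Quantization.compressed_tensors
    assert Quantization.compressed_tensors.value == "compressed-tensors"
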
@@ -108,7 +108,7 @@ def paged_attention(
if softcap is not None:
raise RuntimeError("Paged attention doesn't support softcapping")
input_lengths = seqlen.input_lengths + seqlen.cache_lengths
-from vllm._C import ops
+import attention_kernels

out = torch.empty_like(query)

@@ -116,7 +116,7 @@ def paged_attention(
max_num_partitions == 1 or num_seqs * num_heads > 512
)
if use_v1:
-ops.paged_attention_v1(
+attention_kernels.paged_attention_v1(
out,
query,
kv_cache.key,
@@ -146,7 +146,7 @@ def paged_attention(
)
max_logits = torch.empty_like(exp_sums)

-ops.paged_attention_v2(
+attention_kernels.paged_attention_v2(
out,
exp_sums,
max_logits,
@@ -200,12 +200,12 @@ def paged_reshape_and_cache(
):
if SYSTEM == "cuda":
try:
-from vllm._C import cache_ops
+import attention_kernels
except Exception as e:
raise ImportError(
-f"Could not import vllm paged attention. Make sure your installation is correct. Complete error: {e}"
+f"Could not import attention_kernels. Make sure your installation is correct. Complete error: {e}"
)
-cache_ops.reshape_and_cache(
+attention_kernels.reshape_and_cache(
key, value, key_cache, value_cache, slots, "auto", 1.0
)
elif SYSTEM == "rocm":

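The hunks above swap the vLLM C extension for the standalone attention_kernels package shipped via the new "attention" extra. A hedged sketch of the guarded-import pattern the code relies on (the helper name is hypothetical; the function names come from the diff):

    def load_paged_attention_ops():
        """Import the paged-attention kernels, failing with an actionable message."""
        try:
            import attention_kernels  # provided by the `attention` extra wheels
        except Exception as e:  # availability depends on the installed wheel
            raise ImportError(
                "Could not import attention_kernels. Make sure your installation "
                f"is correct. Complete error: {e}"
            )
        return attention_kernels
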
@@ -0,0 +1,3 @@
from .loader import CompressedTensorsLoader

__all__ = ["CompressedTensorsLoader"]

@@ -0,0 +1,174 @@
from typing import Any, Dict, List, Union

from compressed_tensors import QuantizationConfig, QuantizationStatus
from compressed_tensors.config import CompressionFormat
from compressed_tensors.quantization import (
    QuantizationScheme,
    QuantizationType,
    find_name_or_class_matches,
)
from loguru import logger
from pydantic import ValidationError
from torch import nn

from text_generation_server.layers.compressed_tensors.w8an_fp import W8ANFpLoader
from text_generation_server.layers.compressed_tensors.wna16_int import WNA16Loader
from text_generation_server.utils.log import log_once
from text_generation_server.utils.weights import (
    DefaultWeightsLoader,
    UnquantizedWeight,
    Weights,
    WeightsLoader,
)

# compressed-tensors can match modules as quantization targets. However,
# they need to be objects rather than classes or class names. Since we
# need to match `Linear` targets, make an instance that can be re-used.
_EMPTY_LINEAR: nn.Module = nn.Linear(0, 0)


class CompressedTensorsLoader(WeightsLoader):
    """Loader for checkpoints stored in the compressed-tensors format."""

    def __init__(self, config: Dict[str, Any]):
        quantization_config_raw = config.get("quantization_config")
        if quantization_config_raw is None:
            # `compression_config` was renamed to `quantization_config`; support
            # retained for backward compatibility.
            quantization_config_raw = config.get("compression_config")
        if quantization_config_raw is None:
            raise ValueError(
                "Checkpoint does not have compressed-tensors configuration"
            )

        try:
            quantization_config = QuantizationConfig.model_validate(
                quantization_config_raw
            )
        except ValidationError as e:
            raise ValueError("Cannot parse compressed-tensors configuration") from e

        if quantization_config.quantization_status not in (
            QuantizationStatus.COMPRESSED,
            QuantizationStatus.FROZEN,
        ):
            raise ValueError(
                f"Model quantization was not finished, status was: {quantization_config.quantization_status}"
            )

        self.ignore = (
            quantization_config.ignore if quantization_config.ignore is not None else []
        )
        self.loaders = self._get_target_loaders(quantization_config)

        for target, loader in self.loaders.items():
            log_once(
                logger.info,
                f"Using {loader} for compressed-tensors target '{target}'",
            )

    def get_weights(self, weights: Weights, prefix: str):
        loader = self._lookup_loader(prefix)
        return loader.get_weights(weights, prefix)

    def get_weights_col_packed(
        self,
        weights: "Weights",
        prefix: str,
        block_sizes: Union[int, List[int]],
    ):
        loader = self._lookup_loader(prefix)
        return loader.get_weights_col_packed(weights, prefix, block_sizes)

    def get_multi_weights_col(self, weights: Weights, prefixes: List[str], dim: int):
        loader = self._lookup_loader(prefixes[0])
        return loader.get_multi_weights_col(weights, prefixes, dim)

    def get_weights_row(self, weights: Weights, prefix: str):
        loader = self._lookup_loader(prefix)
        return loader.get_weights_row(weights, prefix)

    def _get_target_loaders(
        self, quantization_config: QuantizationConfig
    ) -> Dict[str, WeightsLoader]:
        """
        A compressed-tensors checkpoint can use different quantizations
        for different targets. This method returns a dictionary with a
        loader per target.
        """
        loaders: Dict[str, WeightsLoader] = {}

        format = quantization_config.format

        for group_name, group in quantization_config.config_groups.items():
            # The group configuration can be a string, but does that ever
            # happen in a serialized quantization config?
            assert isinstance(group, QuantizationScheme)

            loader = self._create_loader_for_group(format, group_name, group)

            # A quantized parameter group can have multiple targets, add the
            # loader for all the targets.
            for target in group.targets:
                if target in loaders:
                    raise ValueError(
                        f"Target '{target}' has multiple configured loaders"
                    )
                loaders[target] = loader

        return loaders

    def _create_loader_for_group(
        self, format: str, group_name: str, group: QuantizationScheme
    ) -> WeightsLoader:
        """
        Find and create a loader for the group with the given quantization
        scheme.
        """
        # NOTE: we ignore group.output_activations because we don't support
        # output quantization yet.

        input_activations = group.input_activations
        weights = group.weights
        if (
            format
            in {
                CompressionFormat.float_quantized.value,
                CompressionFormat.naive_quantized.value,
            }
            and weights is not None
            and weights.type == QuantizationType.FLOAT
            and weights.num_bits == 8
        ):
            # FP W8A8 or W8A16.
            return W8ANFpLoader(input_activations=input_activations, weights=weights)
        elif (
            format == CompressionFormat.pack_quantized.value
            and weights is not None
            and weights.type == QuantizationType.INT
            and weights.num_bits in (4, 8)
        ):
            # INT W4A16 or W8A16 (GPTQ/AWQ-like).
            return WNA16Loader(weights)
        else:
            raise ValueError(
                f"Group '{group_name}' has unsupported compressed-tensors configuration"
            )

    def _lookup_loader(self, prefix: str) -> WeightsLoader:
        """
        Look up the loader to use for a given parameter name (prefix).
        """
        if len(find_name_or_class_matches(prefix, _EMPTY_LINEAR, self.ignore)) > 0:
            return DefaultWeightsLoader(UnquantizedWeight)

        # We currently only handle linear layers, so unconditionally pass
        # a `Linear` instance.
        targets = find_name_or_class_matches(prefix, _EMPTY_LINEAR, self.loaders.keys())
        if len(targets) == 0:
            raise ValueError(
                f"Cannot find compressed-tensors target for prefix: {prefix}"
            )
        return self.loaders[targets[0]]

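A hedged usage sketch for the loader above: it takes the model's parsed config.json dictionary and builds one weight loader per quantization target from the `quantization_config` section. The checkpoint path and the layer prefix in the comment are hypothetical; weight loading itself goes through the server's `Weights` helper elsewhere.

    import json

    from text_generation_server.layers.compressed_tensors import CompressedTensorsLoader

    # Hypothetical path to a compressed-tensors checkpoint's config.
    with open("/data/model/config.json") as f:
        config = json.load(f)

    # Validates `quantization_config` (or the legacy `compression_config`) and
    # selects W8ANFpLoader / WNA16Loader per configured target.
    loader = CompressedTensorsLoader(config)
    # e.g. loader.get_weights_row(weights, "model.layers.0.self_attn.o_proj")
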
@ -0,0 +1,174 @@
|
||||||
|
from typing import List, Optional, Union
|
||||||
|
|
||||||
|
import torch
|
||||||
|
from compressed_tensors.quantization import QuantizationArgs, QuantizationType
|
||||||
|
|
||||||
|
from text_generation_server.layers.fp8 import Fp8Weight, _load_scalar_or_matrix_scale
|
||||||
|
from text_generation_server.utils.weights import Weights, WeightsLoader
|
||||||
|
|
||||||
|
|
||||||
|
class W8ANFpLoader(WeightsLoader):
|
||||||
|
"""
|
||||||
|
Loader for W8A8/W8A16 FP compressed-tensors parameters.
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(
|
||||||
|
self,
|
||||||
|
*,
|
||||||
|
input_activations: Optional[QuantizationArgs],
|
||||||
|
weights: QuantizationArgs,
|
||||||
|
):
|
||||||
|
assert weights.type == QuantizationType.FLOAT and weights.num_bits == 8
|
||||||
|
|
||||||
|
# We ignore the `strategy` option which sets the scales to be
|
||||||
|
# per-tensor, per-channel or per-token. What scales are supported
|
||||||
|
# is dependent on the kernels used (e.g. cutlass can do tokenwise,
|
||||||
|
# Torch cannot, and FP8-Marlin does not quantize inputs at all).
|
||||||
|
# So, instead we try to use the best-possible configuration.
|
||||||
|
|
||||||
|
self.load_weight_scale = not weights.dynamic
|
||||||
|
self.load_input_scale = (
|
||||||
|
input_activations is not None and not input_activations.dynamic
|
||||||
|
)
|
||||||
|
self.force_w8a16 = (
|
||||||
|
input_activations is not None and input_activations.num_bits == 16
|
||||||
|
)
|
||||||
|
|
||||||
|
def __str__(self) -> str:
|
||||||
|
def scale_to_str(scale):
|
||||||
|
return "static" if scale else "dynamic"
|
||||||
|
|
||||||
|
quantization_type = f"W8A{16 if self.force_w8a16 else 8}"
|
||||||
|
|
||||||
|
return f"{self.__class__.__name__} ({quantization_type}, weight: {scale_to_str(self.load_weight_scale)}, input: {scale_to_str(self.load_input_scale)})"
|
||||||
|
|
||||||
|
def get_weights(self, weights: "Weights", prefix: str):
|
||||||
|
w = weights.get_tensor(f"{prefix}.weight")
|
||||||
|
|
||||||
|
weight_scale = None
|
||||||
|
if self.load_weight_scale:
|
||||||
|
weight_scale = (
|
||||||
|
weights.get_tensor(f"{prefix}.weight_scale", to_dtype=False)
|
||||||
|
.reshape(-1)
|
||||||
|
.expand(w.shape[0])
|
||||||
|
)
|
||||||
|
|
||||||
|
input_scale = None
|
||||||
|
if self.load_input_scale:
|
||||||
|
input_scale = weights.get_tensor(
|
||||||
|
f"{prefix}.input_scale", to_dtype=False
|
||||||
|
).reshape(-1)
|
||||||
|
|
||||||
|
return Fp8Weight(
|
||||||
|
weight=w,
|
||||||
|
weight_scale=weight_scale,
|
||||||
|
input_scale=input_scale,
|
||||||
|
dtype=weights.dtype,
|
||||||
|
force_w8a16=self.force_w8a16,
|
||||||
|
)
|
||||||
|
|
||||||
|
def get_weights_col_packed(
|
||||||
|
self,
|
||||||
|
weights: Weights,
|
||||||
|
prefix: str,
|
||||||
|
block_sizes: Union[int, List[int]],
|
||||||
|
):
|
||||||
|
w = weights.get_packed_sharded(
|
||||||
|
f"{prefix}.weight", dim=0, block_sizes=block_sizes
|
||||||
|
)
|
||||||
|
|
||||||
|
weight_scale = None
|
||||||
|
if self.load_weight_scale:
|
||||||
|
weight_scale = weights.get_tensor(f"{prefix}.weight_scale", to_dtype=False)
|
||||||
|
if weight_scale.numel() > 1:
|
||||||
|
weight_scale = weights.get_packed_sharded(
|
||||||
|
f"{prefix}.weight_scale",
|
||||||
|
dim=0,
|
||||||
|
block_sizes=block_sizes,
|
||||||
|
to_dtype=False,
|
||||||
|
)
|
||||||
|
weight_scale = weight_scale.reshape(-1).expand(w.shape[0])
|
||||||
|
|
||||||
|
input_scale = None
|
||||||
|
if self.load_input_scale:
|
||||||
|
input_scale = weights.get_tensor(f"{prefix}.input_scale", to_dtype=False)
|
||||||
|
if input_scale.numel() > 1:
|
||||||
|
input_scale = weights.get_packed_sharded(
|
||||||
|
f"{prefix}.input_scale",
|
||||||
|
dim=0,
|
||||||
|
block_sizes=block_sizes,
|
||||||
|
to_dtype=False,
|
||||||
|
)
|
||||||
|
input_scale = input_scale.reshape(-1).max()
|
||||||
|
|
||||||
|
return Fp8Weight(
|
||||||
|
weight=w,
|
||||||
|
weight_scale=weight_scale,
|
||||||
|
input_scale=input_scale,
|
||||||
|
dtype=weights.dtype,
|
||||||
|
force_w8a16=self.force_w8a16,
|
||||||
|
)
|
||||||
|
|
||||||
|
def get_multi_weights_col(self, weights: "Weights", prefixes: List[str], dim: int):
|
||||||
|
# FIXME: Force to_device to false as fp8 weights do not support torch.cat on device yet
|
||||||
|
w = [
|
||||||
|
weights.get_sharded(f"{p}.weight", dim=0, to_device=False) for p in prefixes
|
||||||
|
]
|
||||||
|
shapes = [x.shape for x in w]
|
||||||
|
|
||||||
|
# Concat then send to the device
|
||||||
|
w = torch.cat(w, dim=dim).to(weights.device)
|
||||||
|
|
||||||
|
weight_scale = None
|
||||||
|
if self.load_weight_scale:
|
||||||
|
weight_scale = [
|
||||||
|
_load_scalar_or_matrix_scale(weights, f"{p}.weight_scale", shape)
|
||||||
|
for p, shape in zip(prefixes, shapes)
|
||||||
|
]
|
||||||
|
weight_scale = torch.cat(weight_scale, dim=0).reshape(-1)
|
||||||
|
|
||||||
|
input_scale = None
|
||||||
|
if self.load_input_scale:
|
||||||
|
input_scale = [
|
||||||
|
_load_scalar_or_matrix_scale(weights, f"{p}.input_scale", shape)
|
||||||
|
for p, shape in zip(prefixes, shapes)
|
||||||
|
if weights.has_tensor(f"{p}.input_scale")
|
||||||
|
]
|
||||||
|
assert len(input_scale) == 0 or len(input_scale) == len(prefixes)
|
||||||
|
input_scale = (
|
||||||
|
torch.cat(input_scale, dim=0).reshape(-1).max()
|
||||||
|
if len(input_scale) != 0
|
||||||
|
else None
|
||||||
|
)
|
||||||
|
|
||||||
|
return Fp8Weight(
|
||||||
|
weight=w,
|
||||||
|
weight_scale=weight_scale,
|
||||||
|
input_scale=input_scale,
|
||||||
|
dtype=weights.dtype,
|
||||||
|
force_w8a16=self.force_w8a16,
|
||||||
|
)
|
||||||
|
|
||||||
|
def get_weights_row(self, weights: "Weights", prefix: str):
|
||||||
|
w = weights.get_sharded(f"{prefix}.weight", dim=1)
|
||||||
|
weight_scale = None
|
||||||
|
if self.load_weight_scale:
|
||||||
|
weight_scale = (
|
||||||
|
weights.get_tensor(f"{prefix}.weight_scale", to_dtype=False)
|
||||||
|
.reshape(-1)
|
||||||
|
.expand(w.shape[0])
|
||||||
|
)
|
||||||
|
|
||||||
|
input_scale = None
|
||||||
|
if self.load_input_scale:
|
||||||
|
input_scale = weights.get_tensor(
|
||||||
|
f"{prefix}.input_scale", to_dtype=False
|
||||||
|
).reshape(-1)
|
||||||
|
|
||||||
|
return Fp8Weight(
|
||||||
|
weight=w,
|
||||||
|
weight_scale=weight_scale,
|
||||||
|
input_scale=input_scale,
|
||||||
|
dtype=weights.dtype,
|
||||||
|
force_w8a16=self.force_w8a16,
|
||||||
|
)
|
|
@ -0,0 +1,188 @@
|
||||||
|
from typing import List, Union
|
||||||
|
|
||||||
|
import torch
|
||||||
|
from compressed_tensors.quantization import ActivationOrdering, QuantizationArgs
|
||||||
|
from loguru import logger
|
||||||
|
|
||||||
|
from text_generation_server.layers.marlin.gptq import repack_gptq_for_marlin
|
||||||
|
from text_generation_server.utils.log import log_once
|
||||||
|
from text_generation_server.utils.weights import Weights, WeightsLoader
|
||||||
|
|
||||||
|
|
||||||
|
class WNA16Loader(WeightsLoader):
|
||||||
|
"""
|
||||||
|
Loader for W4A16/W8A16 INT compressed-tensors parameters.
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(self, weights: QuantizationArgs):
|
||||||
|
self.weights = weights
|
||||||
|
self.desc_act = self.weights.actorder == ActivationOrdering.GROUP
|
||||||
|
self.groupsize = (
|
||||||
|
-1 if self.weights.group_size is None else self.weights.group_size
|
||||||
|
)
|
||||||
|
|
||||||
|
def __str__(self) -> str:
|
||||||
|
quantization_type = f"W{self.weights.num_bits}8A16"
|
||||||
|
|
||||||
|
return f"{self.__class__.__name__} ({quantization_type})"
|
||||||
|
|
||||||
|
def get_weights(self, weights: Weights, prefix: str):
|
||||||
|
log_once(logger.info, "Using GPTQ-Marlin kernels")
|
||||||
|
try:
|
||||||
|
weight_packed = weights.get_tensor(f"{prefix}.weight_packed").t()
|
||||||
|
except RuntimeError:
|
||||||
|
raise RuntimeError(
|
||||||
|
f"Cannot load w{self.weights.num_bits}a16 weight, make sure the model is already quantized"
|
||||||
|
)
|
||||||
|
|
||||||
|
zero_point = None
|
||||||
|
if not self.weights.symmetric:
|
||||||
|
zero_point = weights.get_tensor(f"{prefix}.weight_zero_point").t()
|
||||||
|
|
||||||
|
g_idx = None
|
||||||
|
if self.desc_act:
|
||||||
|
g_idx = weights.get_tensor(f"{prefix}.weight_g_idx")
|
||||||
|
|
||||||
|
scales = weights.get_tensor(f"{prefix}.weight.scales").t()
|
||||||
|
|
||||||
|
return repack_gptq_for_marlin(
|
||||||
|
qweight=weight_packed.contiguous(),
|
||||||
|
scales=scales,
|
||||||
|
qzeros=zero_point,
|
||||||
|
g_idx=g_idx,
|
||||||
|
bits=self.weights.num_bits,
|
||||||
|
desc_act=self.desc_act,
|
||||||
|
groupsize=self.groupsize,
|
||||||
|
quant_method="compressed-tensors",
|
||||||
|
sym=self.weights.symmetric,
|
||||||
|
sharded_infeatures=False,
|
||||||
|
)
|
||||||
|
|
||||||
|
def get_weights_col_packed(
|
||||||
|
self,
|
||||||
|
weights: Weights,
|
||||||
|
prefix: str,
|
||||||
|
block_sizes: Union[int, List[int]],
|
||||||
|
):
|
||||||
|
try:
|
||||||
|
weight_packed = weights.get_packed_sharded(
|
||||||
|
f"{prefix}.weight_packed", dim=0, block_sizes=block_sizes
|
||||||
|
).t()
|
||||||
|
except RuntimeError:
|
||||||
|
raise RuntimeError(
|
||||||
|
f"Cannot load w{self.weights.num_bits}a16 weight, make sure the model is already quantized"
|
||||||
|
)
|
||||||
|
scales = weights.get_packed_sharded(
|
||||||
|
f"{prefix}.weight_scale", dim=0, block_sizes=block_sizes
|
||||||
|
).t()
|
||||||
|
scales = scales.to(dtype=weights.dtype)
|
||||||
|
|
||||||
|
zero_point = None
|
||||||
|
if not self.weights.symmetric:
|
||||||
|
zero_point = weights.get_packed_sharded(
|
||||||
|
f"{prefix}.qzeros", dim=0, block_sizes=block_sizes
|
||||||
|
).t()
|
||||||
|
|
||||||
|
g_idx = None
|
||||||
|
if self.desc_act:
|
||||||
|
g_idx = weights.get_tensor(f"{prefix}.g_idx")
|
||||||
|
|
||||||
|
return repack_gptq_for_marlin(
|
||||||
|
qweight=weight_packed.contiguous(),
|
||||||
|
scales=scales,
|
||||||
|
qzeros=zero_point,
|
||||||
|
g_idx=g_idx,
|
||||||
|
bits=self.weights.num_bits,
|
||||||
|
desc_act=self.desc_act,
|
||||||
|
groupsize=self.groupsize,
|
||||||
|
quant_method="compressed-tensors",
|
||||||
|
sym=self.weights.symmetric,
|
||||||
|
sharded_infeatures=False,
|
||||||
|
)
|
||||||
|
|
||||||
|
def get_multi_weights_col(self, weights: Weights, prefixes: List[str], dim: int):
|
||||||
|
try:
|
||||||
|
weight_packed = torch.cat(
|
||||||
|
[
|
||||||
|
weights.get_sharded(f"{p}.weight_packed", dim=0).t()
|
||||||
|
for p in prefixes
|
||||||
|
],
|
||||||
|
dim=1,
|
||||||
|
)
|
||||||
|
except RuntimeError:
|
||||||
|
raise RuntimeError(
|
||||||
|
f"Cannot load w{self.weights.num_bits}a16 weight, make sure the model is already quantized"
|
||||||
|
)
|
||||||
|
|
||||||
|
scales = torch.cat(
|
||||||
|
[weights.get_sharded(f"{p}.weight_scale", dim=0).t() for p in prefixes],
|
||||||
|
dim=1,
|
||||||
|
)
|
||||||
|
|
||||||
|
zero_point = None
|
||||||
|
if not self.weights.symmetric:
|
||||||
|
zero_point = torch.cat(
|
||||||
|
[weights.get_sharded(f"{p}.qzeros", dim=0).t() for p in prefixes], dim=1
|
||||||
|
).t()
|
||||||
|
|
||||||
|
g_idx = None
|
||||||
|
if self.desc_act:
|
||||||
|
w = [weights.get_tensor(f"{p}.g_idx") for p in prefixes]
|
||||||
|
for w2 in w[1:]:
|
||||||
|
torch.testing.assert_close(w2, w[0])
|
||||||
|
g_idx = w[0]
|
||||||
|
|
||||||
|
return repack_gptq_for_marlin(
|
||||||
|
qweight=weight_packed.contiguous(),
|
||||||
|
scales=scales,
|
||||||
|
qzeros=zero_point,
|
||||||
|
g_idx=g_idx,
|
||||||
|
bits=self.weights.num_bits,
|
||||||
|
desc_act=self.desc_act,
|
||||||
|
groupsize=self.groupsize,
|
||||||
|
quant_method="compressed-tensors",
|
||||||
|
sym=self.weights.symmetric,
|
||||||
|
sharded_infeatures=False,
|
||||||
|
)
|
||||||
|
|
||||||
|
def get_weights_row(self, weights: Weights, prefix: str):
|
||||||
|
log_once(logger.info, "Using GPTQ-Marlin kernels")
|
||||||
|
try:
|
||||||
|
weight_packed = weights.get_sharded(f"{prefix}.weight_packed", dim=1).t()
|
||||||
|
except RuntimeError:
|
||||||
|
raise RuntimeError(
|
||||||
|
f"Cannot load `{self.quantize}` weight, make sure the model is already quantized."
|
||||||
|
)
|
||||||
|
|
||||||
|
zero_point = None
|
||||||
|
if not self.weights.symmetric:
|
||||||
|
if self.desc_act or self.groupsize == -1:
|
||||||
|
zero_point = weights.get_tensor(f"{prefix}.weight_zero_point").t()
|
||||||
|
else:
|
||||||
|
zero_point = weights.get_sharded(
|
||||||
|
f"{prefix}.weight_zero_point", dim=1
|
||||||
|
).t()
|
||||||
|
|
||||||
|
g_idx = None
|
||||||
|
if self.desc_act:
|
||||||
|
g_idx = weights.get_sharded(f"{prefix}.g_idx", dim=0)
|
||||||
|
|
||||||
|
if self.desc_act or self.groupsize == -1:
|
||||||
|
scales = weights.get_tensor(f"{prefix}.weight_scale").t()
|
||||||
|
else:
|
||||||
|
scales = weights.get_sharded(f"{prefix}.weight_scale", dim=1).t()
|
||||||
|
|
||||||
|
sharded_in_features = weights.process_group.size() > 1
|
||||||
|
|
||||||
|
return repack_gptq_for_marlin(
|
||||||
|
qweight=weight_packed.contiguous(),
|
||||||
|
scales=scales,
|
||||||
|
qzeros=zero_point,
|
||||||
|
g_idx=g_idx,
|
||||||
|
bits=self.weights.num_bits,
|
||||||
|
desc_act=self.desc_act,
|
||||||
|
groupsize=self.groupsize,
|
||||||
|
quant_method="compressed-tensors",
|
||||||
|
sym=self.weights.symmetric,
|
||||||
|
sharded_infeatures=sharded_in_features,
|
||||||
|
)
|
|
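The three loaders above all funnel into repack_gptq_for_marlin with quant_method="compressed-tensors"; the column-fused variant first transposes each per-prefix shard and concatenates along dim=1 so fused projections end up side by side on the output dimension. A minimal, self-contained sketch of that gather step (plain torch, hypothetical prefixes and shapes, no TGI types):

import torch

# Stand-ins for weights.get_sharded(f"{p}.weight_packed", dim=0); prefixes and
# shapes are made up for illustration.
shards = {
    "gate_proj": torch.randint(0, 2**31 - 1, (32, 128), dtype=torch.int32),
    "up_proj": torch.randint(0, 2**31 - 1, (32, 128), dtype=torch.int32),
}

# Transpose each shard, then concatenate on dim=1 so the fused projections sit
# next to each other on the output dimension, as get_multi_weights_col does.
weight_packed = torch.cat([shards[p].t() for p in ("gate_proj", "up_proj")], dim=1)
print(weight_packed.shape)  # torch.Size([128, 64])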
@@ -29,7 +29,7 @@ else:
     CUTLASS_FP8_AVAILABLE = False


-def get_fp8_linear() -> Type[torch.nn.Module]:
+def get_fp8_linear(force_w8a16: bool = False) -> Type[torch.nn.Module]:
     """
     Return an FP8 linear `Module` that is compatible with the current system.
     """
@@ -37,7 +37,14 @@ def get_fp8_linear() -> Type[torch.nn.Module]:
     if SYSTEM == "cuda":

         major, _ = torch.cuda.get_device_capability()
-        if major == 8 and os.getenv("USE_CUTLASS_W8A8", "0") != "1":
+        # Marlin is W8A16, use it when:
+        #
+        # - On capability 8.x where x < 8: W8A8 FP8 GEMM is not supported.
+        # - On capability 8.9: W8A8 FP8 GEMM is supported, but Marlin-FP8 is faster.
+        # - On capability 9.x when force_w8a16: cutlass kernels do not support W8A16.
+        if (major == 8 or (major == 9 and force_w8a16)) and os.getenv(
+            "USE_CUTLASS_W8A8", "0"
+        ) != "1":
             # NOTE: Capability 8.9 is supported by cutlass kernels, but FP8-Marlin
             # gives better decoding throughput on L4 and L40.
             from text_generation_server.layers.marlin import GPTQMarlinFP8Linear
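A rough stand-alone sketch of the same dispatch idea, for readers who want to see how force_w8a16 interacts with the capability check; the helper name and return labels are made up, only the branch condition mirrors the diff:

import os
import torch

def pick_fp8_backend(force_w8a16: bool = False) -> str:
    # Hypothetical helper: returns a label instead of a linear Module class.
    if not torch.cuda.is_available():
        return "cpu-fallback"
    major, _ = torch.cuda.get_device_capability()
    # Same condition as in the diff: Marlin (W8A16) on Ampere/Ada, and on
    # Hopper only when the caller explicitly asks for W8A16.
    if (major == 8 or (major == 9 and force_w8a16)) and os.getenv(
        "USE_CUTLASS_W8A8", "0"
    ) != "1":
        return "marlin-fp8-w8a16"
    return "cutlass-w8a8"

print(pick_fp8_backend(force_w8a16=True))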
@@ -283,14 +290,17 @@ class Fp8Weight(Weight):
     weight_scale: Optional[torch.Tensor] = None
     input_scale: Optional[torch.Tensor] = None
     activation_scale_ub: Optional[float] = None
+    force_w8a16: bool = False

     def get_linear(self, bias: torch.Tensor):
         if self.weight_scale is None:
-            return get_fp8_linear().from_unquant(self.weight, bias, self.dtype)
+            return get_fp8_linear(force_w8a16=self.force_w8a16).from_unquant(
+                self.weight, bias, self.dtype
+            )
         # This is not checked by the fbgemm kernels, but they require contiguous
         # memory. Can be non-contiguous when we e.g. expand from scalars.
         self.weight_scale = self.weight_scale.contiguous()
-        return get_fp8_linear().from_fp8(
+        return get_fp8_linear(force_w8a16=self.force_w8a16).from_fp8(
             weight=self.weight,
             scale=self.weight_scale,
             dtype=self.dtype,
@@ -261,7 +261,7 @@ class GPTQMarlinWeight(Weight):

     def __post_init__(self):
         assert self.qweight.dtype == torch.int32
-        assert self.scales.dtype == torch.float16
+        assert self.scales.dtype in (torch.float16, torch.bfloat16)
         assert self.g_idx.dtype == torch.int32
         assert self.perm.dtype == torch.int32

@@ -300,7 +300,7 @@ def repack_gptq_for_marlin(
         raise RuntimeError(
             f"Repacking GPTQ weights with group size {groupsize} as Marlin is not supported, must be one of: {supported_sizes}"
         )
-    if not (sym or quant_method == "awq"):
+    if not (sym or quant_method == "awq" or quant_method == "compressed-tensors"):
         raise RuntimeError(
             "Repacking GPTQ weights with asymmetric quantization as Marlin is not supported."
         )
@@ -370,46 +370,23 @@ def get_model(
     compression_config = config_dict.get("compression_config", None)
     if quantization_config is not None and quantize is None:
         method = quantization_config.get("quant_method", None)
-        config_groups = quantization_config.get("config_groups", None)
         if method in {"gptq", "awq", "exl2"}:
             log_master(logger.info, f"Auto selecting quantization method {method}")
             quantize = method
         elif method == "fbgemm_fp8" or method == "fp8":
             log_master(logger.info, "Auto selecting quantization method fp8")
             quantize = "fp8"
-        elif config_groups is not None:
-            # TODO: at some point we should probably fully parse the compression
-            # configuration to know which parameters are compressed.
-            for _, group in config_groups.items():
-                weights_config = group.get("weights")
-                if weights_config is not None:
-                    if (
-                        weights_config["type"] == "float"
-                        and weights_config["num_bits"] == 8
-                    ):
-                        log_master(
-                            logger.info, "Auto selecting quantization method fp8"
-                        )
-                        quantize = "fp8"
-                        break
+        elif method == "compressed-tensors":
+            log_master(
+                logger.info, "Auto selecting quantization method compressed-tensors"
+            )
+            quantize = "compressed-tensors"
         else:
             log_master(logger.warning, f"Unknown quantization method {method}")
     elif compression_config is not None:
         # `compression_config` renamed to `quantization_config`; support retained for backward compatibility.
-        config_groups = compression_config.get("config_groups")
-        if config_groups is not None:
-            for _, group in config_groups.items():
-                weights_config = group.get("weights")
-                if weights_config is not None:
-                    if (
-                        weights_config["type"] == "float"
-                        and weights_config["num_bits"] == 8
-                    ):
-                        log_master(
-                            logger.info, "Auto selecting quantization method fp8"
-                        )
-                        quantize = "fp8"
-                        break
+        log_master(logger.info, "Auto selecting quantization method compressed-tensors")
+        quantize = "compressed-tensors"

     if dtype is None:
         if quantize in ["awq", "exl2", "gptq", "marlin"]:
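A small stand-alone illustration of the new selection order on a dict shaped like a compressed-tensors config.json (the config content is invented; only the branch structure mirrors the hunk above):

# Hypothetical quantization_config as found in a compressed-tensors checkpoint.
quantization_config = {
    "quant_method": "compressed-tensors",
    "config_groups": {"group_0": {"weights": {"type": "int", "num_bits": 4}}},
}

quantize = None
method = quantization_config.get("quant_method", None)
if method in {"gptq", "awq", "exl2"}:
    quantize = method
elif method in {"fbgemm_fp8", "fp8"}:
    quantize = "fp8"
elif method == "compressed-tensors":
    # config_groups no longer needs to be inspected here: the dedicated loader
    # receives the full config and parses the scheme itself.
    quantize = "compressed-tensors"

print(quantize)  # compressed-tensors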
@@ -559,7 +536,7 @@ def get_model(
        # TODO: fix how we determine model type for Mamba
        if "ssm_cfg" in config_dict:
            # *only happens in Mamba case
-            model_type = "ssm"
+            model_type = "mamba"
        else:
            raise RuntimeError(
                f"Could not determine model type for {model_id} revision {revision}"
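For context, the fallback can be exercised on its own with a made-up config dict; only the "ssm_cfg" check and the "mamba" value come from the hunk above:

# Hypothetical Mamba-style config without an explicit model_type.
config_dict = {"ssm_cfg": {}, "d_model": 2560, "n_layer": 64}

model_type = config_dict.get("model_type")
if model_type is None and "ssm_cfg" in config_dict:
    # Only happens in the Mamba case; map it onto the "mamba" model type.
    model_type = "mamba"

print(model_type)  # mamba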
@@ -23,8 +23,10 @@ from typing import Optional, List, Tuple, Any
 from text_generation_server.layers.attention.kv_cache import get_kv_scales
 from text_generation_server.utils.import_utils import SYSTEM

-if SYSTEM != "ipex":
+if SYSTEM == "rocm":
     from vllm.model_executor.layers.fused_moe import fused_moe
+elif SYSTEM != "ipex":
+    from moe_kernels.fused_moe import fused_moe
 else:
     from intel_extension_for_pytorch.llm.modules import GatedMLPMOE

@@ -212,7 +212,7 @@ class MambaModel(nn.Module):
         try:
             self.lm_head = SpeculativeHead.load(config, f"{prefix}.embeddings", weights)
         except RuntimeError:
-            self.lm_head = SpeculativeHead.load(config, f"{prefix}.embeddings", weights)
+            self.lm_head = SpeculativeHead.load(config, f"{prefix}.embedding", weights)
         self.config = config

     def forward(
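The try/except exists presumably because checkpoints name the embedding weight inconsistently ("embeddings" vs "embedding"); a dictionary-based sketch of the same fallback, with a hypothetical weights store and helper standing in for SpeculativeHead.load:

# Hypothetical stand-in for the weights store; the real code goes through
# SpeculativeHead.load and raises RuntimeError on a missing tensor.
weights = {"backbone.embedding": "embedding-tensor"}

def load_lm_head(weights, prefix):
    for name in (f"{prefix}.embeddings", f"{prefix}.embedding"):
        if name in weights:
            return weights[name]
    raise RuntimeError(f"No embedding weight found under {prefix}")

print(load_lm_head(weights, "backbone"))  # embedding-tensor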
@@ -5,7 +5,7 @@ from loguru import logger
 from typing import Dict, Union
 from text_generation_server.pb.generate_pb2 import GrammarType

-from outlines.fsm.fsm import RegexFSM
+from outlines.fsm.guide import RegexGuide
 from outlines.fsm.json_schema import build_regex_from_schema
 from functools import lru_cache
 from typing import List, Optional, DefaultDict
@@ -482,7 +482,7 @@ class HeterogeneousProcessorWrapper(LogitsProcessor):

 class GrammarLogitProcessor(LogitsProcessor):
     fsm_state: DefaultDict[int, int]
-    fsm: RegexFSM
+    fsm: RegexGuide

     def __init__(self, tokenizer, device, grammar, grammar_type):
         self.device = device
@@ -498,9 +498,10 @@ class GrammarLogitProcessor(LogitsProcessor):
     ):
         if fsm_grammar_state == -1 or self.fsm is None:
             return logits
-        allowed_tokens = self.fsm.allowed_token_ids(fsm_grammar_state)
+        allowed_tokens = self.fsm.get_next_instruction(fsm_grammar_state).tokens
         mask = torch.full_like(logits, -math.inf)
-        mask[:, allowed_tokens] = 0
+        if allowed_tokens is not None:
+            mask[:, allowed_tokens] = 0
         biased_scores = logits + mask
         return biased_scores

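The masking step is easy to check in isolation; a minimal sketch with dummy logits, where the allowed-token list stands in for what guide.get_next_instruction(state).tokens returns in the processor above:

import math
import torch

logits = torch.randn(1, 8)
allowed_tokens = [2, 5]  # hypothetical token ids returned by the guide

# Start from -inf everywhere and zero out the allowed ids, so every other token
# becomes impossible once the mask is added to the logits.
mask = torch.full_like(logits, -math.inf)
if allowed_tokens is not None:
    mask[:, allowed_tokens] = 0
biased_scores = logits + mask
print(biased_scores[0, 2].item(), biased_scores[0, 3].item())  # finite, -inf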
@@ -513,7 +514,7 @@ class GrammarLogitProcessor(LogitsProcessor):
     def _advance(next_token_id, fsm_grammar_state, fsm):
         if fsm_grammar_state == -1:
             return fsm_grammar_state
-        return fsm.next_state(fsm_grammar_state, next_token_id)
+        return fsm.get_next_state(fsm_grammar_state, next_token_id)

     # TODO: move grammar compilation into the router
     @staticmethod
@@ -530,7 +531,7 @@ class GrammarLogitProcessor(LogitsProcessor):
             schema = "(.*?)"
         elif grammar_type == GrammarType.GRAMMAR_TYPE_REGEX:
             pass  # schema is already a regex just here for clarity
-        fsm = RegexFSM(schema, tokenizer)
+        fsm = RegexGuide.from_regex(schema, tokenizer)
         logger.debug(f"Compiled FSM in {time.time() - start_time:.2f}s")
         return fsm

@@ -588,8 +589,9 @@ class HeterogeneousGrammarLogitProcessor(LogitsProcessor):
             fsm = self.fsms[i]
             if fsm_grammar_states[i] == -1 or fsm is None:
                 continue
-            allowed_tokens = fsm.allowed_token_ids(fsm_grammar_states[i])
-            mask[i, allowed_tokens] = 0
+            allowed_tokens = fsm.get_next_instruction(fsm_grammar_states[i]).tokens
+            if allowed_tokens is not None:
+                mask[i, allowed_tokens] = 0
             logits[i] += mask[i]
         return logits

@@ -27,7 +27,20 @@ class _FP8QuantizerConfig:
     activation_scale_ub: float


-# We should probably do this with Pytantic JSON deserialization,
+def _get_config_json(model_id: str, revision: Optional[str], filename: str):
+    if os.path.exists(
+        os.path.join(
+            model_id,
+        )
+    ):
+        filename = os.path.join(model_id, filename)
+    else:
+        filename = hf_hub_download(model_id, filename=filename, revision=revision)
+    with open(filename, "r") as f:
+        return json.load(f)
+
+
+# We should probably do this with Pydantic JSON deserialization,
 # but for now we'll stay close to the old _set_gptq_params.
 def _get_quantizer_config(model_id, revision):
     bits = 4
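The helper resolves a config file either from a local checkout or from the Hub. A hedged stand-alone version of the same idea (assuming huggingface_hub is installed; the function name and error handling are mine, not the repo's):

import json
import os
from typing import Optional

from huggingface_hub import hf_hub_download

def read_config_json(model_id: str, revision: Optional[str], filename: str) -> dict:
    # If model_id points at a local directory, read the file from there;
    # otherwise download it from the Hugging Face Hub.
    if os.path.isdir(model_id):
        path = os.path.join(model_id, filename)
    else:
        path = hf_hub_download(model_id, filename=filename, revision=revision)
    with open(path, "r") as f:
        return json.load(f)

# Example (requires network access for a remote model id):
# config = read_config_json("bigscience/bloom-560m", None, "config.json")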
@@ -39,12 +52,7 @@ def _get_quantizer_config(model_id, revision):

    filename = "config.json"
    try:
-        if os.path.exists(os.path.join(model_id, filename)):
-            filename = os.path.join(model_id, filename)
-        else:
-            filename = hf_hub_download(model_id, filename=filename, revision=revision)
-        with open(filename, "r") as f:
-            data = json.load(f)
+        data = _get_config_json(model_id, revision, filename)

        # FP8 config
        if data["quantization_config"]["quant_method"] == "fbgemm_fp8":
@@ -67,14 +75,7 @@ def _get_quantizer_config(model_id, revision):
    except Exception:
        filename = "quantize_config.json"
        try:
-            if os.path.exists(os.path.join(model_id, filename)):
-                filename = os.path.join(model_id, filename)
-            else:
-                filename = hf_hub_download(
-                    model_id, filename=filename, revision=revision
-                )
-            with open(filename, "r") as f:
-                data = json.load(f)
+            data = _get_config_json(model_id, revision, filename)
            bits = data["bits"]
            groupsize = data["group_size"]

@@ -90,14 +91,7 @@ def _get_quantizer_config(model_id, revision):
        except Exception:
            filename = "quant_config.json"
            try:
-                if os.path.exists(os.path.join(model_id, filename)):
-                    filename = os.path.join(model_id, filename)
-                else:
-                    filename = hf_hub_download(
-                        model_id, filename=filename, revision=revision
-                    )
-                with open(filename, "r") as f:
-                    data = json.load(f)
+                data = _get_config_json(model_id, revision, filename)
                bits = data["w_bit"]
                groupsize = data["q_group_size"]
                desc_act = data["desc_act"]
@@ -119,6 +113,14 @@ def _get_quantizer_config(model_id, revision):
 def get_loader(
     quantize: Optional[str], model_id: str, revision: Optional[str]
 ) -> WeightsLoader:
+    if quantize == "compressed-tensors":
+        config = _get_config_json(model_id, revision, "config.json")
+        from text_generation_server.layers.compressed_tensors import (
+            CompressedTensorsLoader,
+        )
+
+        return CompressedTensorsLoader(config)
+
     quantizer_config = _get_quantizer_config(model_id, revision)
     if quantize in {"awq", "gptq"}:
         from text_generation_server.layers.gptq import GPTQWeightsLoader
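Hypothetical usage of the new branch, assuming get_loader is exposed from text_generation_server.utils.quantization and the checkpoint carries a compressed-tensors quantization_config (the model id is made up):

from text_generation_server.utils.quantization import get_loader

# The call resolves config.json and returns a CompressedTensorsLoader when
# quantize == "compressed-tensors".
loader = get_loader(
    quantize="compressed-tensors",
    model_id="org/example-llama-w4a16",
    revision=None,
)
print(type(loader).__name__)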