Remove vLLM dependency for CUDA (#2751)
* Remove vLLM dependency for CUDA

This change adds `attention-kernels` as a dependency for paged attention and cache reshaping. With that, we don't use vLLM anywhere for CUDA.

Tested run (since we don't have paged attention in CI):

```
❯ ATTENTION=paged python -m pytest integration-tests -k "llama and awq" --release
[...]
5 snapshots passed.
```

* Fix clippy warning
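For context, the CUDA code paths now import `attention_kernels` directly where they previously imported `vllm._C`. Below is a minimal sketch of the guarded import and the cache-reshaping call as used in `paged_reshape_and_cache` in the diff; the tensor shapes are illustrative placeholders assuming a vLLM-style paged KV-cache layout, not the values TGI actually passes:

```python
import torch

try:
    # Provided by the attention-kernels wheels pinned in pyproject.toml below.
    import attention_kernels
except Exception as e:
    raise ImportError(
        f"Could not import attention_kernels. Make sure your installation is correct. Complete error: {e}"
    )

# Illustrative placeholders: 8 tokens, 2 KV heads, head size 64,
# 4 cache blocks of 16 slots, fp16 (x = 16 // element_size = 8).
key = torch.randn(8, 2, 64, dtype=torch.float16, device="cuda")
value = torch.randn(8, 2, 64, dtype=torch.float16, device="cuda")
key_cache = torch.zeros(4, 2, 64 // 8, 16, 8, dtype=torch.float16, device="cuda")
value_cache = torch.zeros(4, 2, 64, 16, dtype=torch.float16, device="cuda")
slots = torch.arange(8, dtype=torch.int64, device="cuda")

# Same call site as in paged_reshape_and_cache: "auto" kv-cache dtype, scale 1.0.
attention_kernels.reshape_and_cache(key, value, key_cache, value_cache, slots, "auto", 1.0)
```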
parent 6489f85269
commit 52e48739a5

Dockerfile: 16 changed lines

@@ -161,18 +161,6 @@ COPY server/custom_kernels/ .
# Build specific version of transformers
RUN python setup.py build

# Build vllm CUDA kernels
FROM kernel-builder AS vllm-builder

WORKDIR /usr/src

ENV TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 8.9 9.0+PTX"

COPY server/Makefile-vllm Makefile

# Build specific version of vllm
RUN make build-vllm-cuda

# Build mamba kernels
FROM kernel-builder AS mamba-builder
WORKDIR /usr/src

@@ -230,8 +218,6 @@ COPY --from=awq-kernels-builder /usr/src/llm-awq/awq/kernels/build/lib.linux-x86
COPY --from=eetq-kernels-builder /usr/src/eetq/build/lib.linux-x86_64-cpython-311 /opt/conda/lib/python3.11/site-packages
# Copy build artifacts from lorax punica kernels builder
COPY --from=lorax-punica-builder /usr/src/lorax-punica/server/punica_kernels/build/lib.linux-x86_64-cpython-311 /opt/conda/lib/python3.11/site-packages
# Copy build artifacts from vllm builder
COPY --from=vllm-builder /usr/src/vllm/build/lib.linux-x86_64-cpython-311 /opt/conda/lib/python3.11/site-packages
# Copy build artifacts from mamba builder
COPY --from=mamba-builder /usr/src/mamba/build/lib.linux-x86_64-cpython-311/ /opt/conda/lib/python3.11/site-packages
COPY --from=mamba-builder /usr/src/causal-conv1d/build/lib.linux-x86_64-cpython-311/ /opt/conda/lib/python3.11/site-packages

@@ -247,7 +233,7 @@ COPY server/Makefile server/Makefile
RUN cd server && \
    make gen-server && \
    pip install -r requirements_cuda.txt && \
    pip install ".[bnb, accelerate, compressed-tensors, marlin, moe, quantize, peft, outlines]" --no-cache-dir && \
    pip install ".[attention, bnb, accelerate, compressed-tensors, marlin, moe, quantize, peft, outlines]" --no-cache-dir && \
    pip install nvidia-nccl-cu12==2.22.3

ENV LD_PRELOAD=/opt/conda/lib/python3.11/site-packages/nvidia/nccl/lib/libnccl.so.2

@@ -978,16 +978,15 @@
        "nixpkgs": "nixpkgs_6"
      },
      "locked": {
        "lastModified": 1731601436,
        "narHash": "sha256-PJmXLyz06XnLG3wB5vRLgeJXoVvpuCx6c70khYv6J1o=",
        "lastModified": 1731674227,
        "narHash": "sha256-k/ur37KSc+RXcwwz0tgxeamz6wQ5rsOe5hMepzIdD2s=",
        "owner": "huggingface",
        "repo": "text-generation-inference-nix",
        "rev": "9510f57282795d6e0dbbd163d2b77a6b5bb52566",
        "rev": "407b9e22a0b7121bf6e171d67ce0144e3f3e39bf",
        "type": "github"
      },
      "original": {
        "owner": "huggingface",
        "ref": "nixpkgs-update-20241114",
        "repo": "text-generation-inference-nix",
        "type": "github"
      }

@@ -5,7 +5,7 @@
      inputs.nixpkgs.follows = "tgi-nix/nixpkgs";
    };
    nix-filter.url = "github:numtide/nix-filter";
    tgi-nix.url = "github:huggingface/text-generation-inference-nix/nixpkgs-update-20241114";
    tgi-nix.url = "github:huggingface/text-generation-inference-nix";
    nixpkgs.follows = "tgi-nix/nixpkgs";
    flake-utils.url = "github:numtide/flake-utils";
    rust-overlay = {

@@ -3,6 +3,7 @@
  buildPythonPackage,
  poetry-core,
  mypy-protobuf,
  attention-kernels,
  awq-inference-engine,
  causal-conv1d,
  compressed-tensors,

@@ -27,15 +28,18 @@
  opentelemetry-exporter-otlp,
  opentelemetry-instrumentation-grpc,
  opentelemetry-semantic-conventions,
  outlines,
  peft,
  prometheus-client,
  punica-kernels,
  py-cpuinfo,
  pydantic,
  safetensors,
  tokenizers,
  torch,
  sentencepiece,
  transformers,
  typer,
  vllm,
}:

let

@@ -72,6 +76,7 @@ buildPythonPackage {
  pythonRemoveDeps = [ "scipy" ];

  dependencies = [
    attention-kernels
    awq-inference-engine
    eetq
    causal-conv1d

@@ -95,14 +100,17 @@ buildPythonPackage {
    opentelemetry-exporter-otlp
    opentelemetry-instrumentation-grpc
    opentelemetry-semantic-conventions
    outlines
    peft
    prometheus-client
    punica-kernels
    py-cpuinfo
    pydantic
    safetensors
    sentencepiece
    tokenizers
    transformers
    typer
    vllm
  ];

  prePatch = ''

@@ -22,6 +22,7 @@ use tracing::warn;
use utoipa::ToSchema;
use validation::Validation;

#[allow(clippy::large_enum_variant)]
#[derive(Clone)]
pub enum Tokenizer {
    Python {

@@ -29,8 +29,8 @@ install-server: gen-server
install: install-cuda
	echo "Installed server"

install-cuda: install-server install-flash-attention-v2-cuda install-vllm-cuda install-flash-attention
	pip install -e ".[bnb,marlin,moe]"
install-cuda: install-server install-flash-attention-v2-cuda install-flash-attention
	pip install -e ".[attention,bnb,marlin,moe]"
	pip install nvidia-nccl-cu12==2.22.3

install-rocm: install-server install-flash-attention-v2-rocm install-vllm-rocm

@@ -1,14 +1,4 @@
commit_cuda := d243e9dc7e2c9c2e36a4150ec8e64809cb55c01b
commit_rocm := 4e0929e6e4fa0a3d09d358715c288020ea9dc247
build-vllm-cuda:
	if [ ! -d 'vllm' ]; then \
		pip install -U ninja packaging --no-cache-dir && \
		git clone https://github.com/Narsil/vllm.git vllm; \
	fi
	cd vllm && git fetch origin && git checkout $(commit_cuda) && python setup.py build

install-vllm-cuda: build-vllm-cuda
	cd vllm && git fetch origin && git checkout $(commit_cuda) && pip install -e .

build-vllm-rocm:
	if [ ! -d 'vllm' ]; then \

@@ -200,6 +200,74 @@ files = [
    {file = "async_timeout-4.0.3-py3-none-any.whl", hash = "sha256:7405140ff1230c310e51dc27b3145b9092d659ce68ff733fb0cefe3ee42be028"},
]

[[package]]
name = "attention-kernels"
version = "0.1.1"
description = "Attention kernels"
optional = true
python-versions = ">=3.8"
files = [
    {file = "attention_kernels-0.1.1+cu123torch2.4-cp310-cp310-linux_x86_64.whl", hash = "sha256:812851d4ce0f54ca764ff3815a731b15f0cb110115d0aa2d0997cd7794d808bb"},
]

[package.dependencies]
torch = "*"

[package.source]
type = "url"
url = "https://github.com/danieldk/attention-kernels/releases/download/v0.1.1/attention_kernels-0.1.1+cu123torch2.4-cp310-cp310-linux_x86_64.whl"

[[package]]
name = "attention-kernels"
version = "0.1.1"
description = "Attention kernels"
optional = true
python-versions = ">=3.8"
files = [
    {file = "attention_kernels-0.1.1+cu123torch2.4-cp311-cp311-linux_x86_64.whl", hash = "sha256:614c402621b11dd1f5741a016b9fd27cb6a68814471f2048bc05206923516268"},
]

[package.dependencies]
torch = "*"

[package.source]
type = "url"
url = "https://github.com/danieldk/attention-kernels/releases/download/v0.1.1/attention_kernels-0.1.1+cu123torch2.4-cp311-cp311-linux_x86_64.whl"

[[package]]
name = "attention-kernels"
version = "0.1.1"
description = "Attention kernels"
optional = true
python-versions = ">=3.8"
files = [
    {file = "attention_kernels-0.1.1+cu123torch2.4-cp312-cp312-linux_x86_64.whl", hash = "sha256:6b2ca7c98997431d5f6c4af7553dce6b1bff8dfdec374c97c6ffba71325a02b7"},
]

[package.dependencies]
torch = "*"

[package.source]
type = "url"
url = "https://github.com/danieldk/attention-kernels/releases/download/v0.1.1/attention_kernels-0.1.1+cu123torch2.4-cp312-cp312-linux_x86_64.whl"

[[package]]
name = "attention-kernels"
version = "0.1.1"
description = "Attention kernels"
optional = true
python-versions = ">=3.8"
files = [
    {file = "attention_kernels-0.1.1+cu123torch2.4-cp39-cp39-linux_x86_64.whl", hash = "sha256:a56710c5626e461d6f628ae14b74ffc89833578ebd59c3c0c47f5d6f07461fbf"},
]

[package.dependencies]
torch = "*"

[package.source]
type = "url"
url = "https://github.com/danieldk/attention-kernels/releases/download/v0.1.1/attention_kernels-0.1.1+cu123torch2.4-cp39-cp39-linux_x86_64.whl"

[[package]]
name = "attrs"
version = "24.2.0"

@@ -3985,6 +4053,7 @@ type = ["pytest-mypy"]

[extras]
accelerate = ["accelerate"]
attention = ["attention-kernels", "attention-kernels", "attention-kernels", "attention-kernels"]
bnb = ["bitsandbytes"]
compressed-tensors = ["compressed-tensors"]
marlin = ["marlin-kernels", "marlin-kernels", "marlin-kernels", "marlin-kernels"]

@@ -3997,4 +4066,4 @@ torch = ["torch"]
[metadata]
lock-version = "2.0"
python-versions = ">=3.9,<3.13"
content-hash = "5d1295a8becce2f65dc68d64f200acb5832de50fc0c37392f6f87bbc5b15d32a"
content-hash = "05add88628d836faceae1a26fde4092651a6eca74555ae38ebff879a7895be7e"

@@ -9,7 +9,7 @@ text-generation-server = 'text_generation_server.cli:app'

[tool.poetry.dependencies]
python = ">=3.9,<3.13"
protobuf = "^4.25.3"
protobuf = ">=4.25.3,<6"
grpcio = "^1.51.1"
grpcio-status = "^1.51.1"
grpcio-reflection = "^1.51.1"

@@ -35,12 +35,18 @@ torch = { version = "^2.4.0", optional = true }
scipy = "^1.11.1"
pillow = "^10.0.0"
outlines= { version = "^0.1.1", optional = true }
prometheus-client = "^0.20.0"
prometheus-client = ">=0.20.0,<0.22"
py-cpuinfo = "^9.0.0"
compressed-tensors = { version = "^0.7.1", optional = true }
# Remove later, temporary workaround for outlines.
numpy = "^1.26"

attention-kernels = [
    { url = "https://github.com/danieldk/attention-kernels/releases/download/v0.1.1/attention_kernels-0.1.1+cu123torch2.4-cp39-cp39-linux_x86_64.whl", python = "~3.9", optional = true },
    { url = "https://github.com/danieldk/attention-kernels/releases/download/v0.1.1/attention_kernels-0.1.1+cu123torch2.4-cp310-cp310-linux_x86_64.whl", python = "~3.10", optional = true },
    { url = "https://github.com/danieldk/attention-kernels/releases/download/v0.1.1/attention_kernels-0.1.1+cu123torch2.4-cp311-cp311-linux_x86_64.whl", python = "~3.11", optional = true },
    { url = "https://github.com/danieldk/attention-kernels/releases/download/v0.1.1/attention_kernels-0.1.1+cu123torch2.4-cp312-cp312-linux_x86_64.whl", python = "~3.12", optional = true },
]
marlin-kernels = [
    { url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.1/marlin_kernels-0.3.1+cu123torch2.4-cp39-cp39-linux_x86_64.whl", python = "~3.9", optional = true },
    { url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.1/marlin_kernels-0.3.1+cu123torch2.4-cp310-cp310-linux_x86_64.whl", python = "~3.10", optional = true },

@@ -58,6 +64,7 @@ rich = "^13.7.1"
[tool.poetry.extras]
torch = ["torch"]
accelerate = ["accelerate"]
attention = ["attention-kernels"]
bnb = ["bitsandbytes"]
compressed-tensors = ["compressed-tensors"]
marlin = ["marlin-kernels"]

@@ -108,7 +108,7 @@ def paged_attention(
    if softcap is not None:
        raise RuntimeError("Paged attention doesn't support softcapping")
    input_lengths = seqlen.input_lengths + seqlen.cache_lengths
    from vllm._C import ops
    import attention_kernels

    out = torch.empty_like(query)

@@ -116,7 +116,7 @@ def paged_attention(
        max_num_partitions == 1 or num_seqs * num_heads > 512
    )
    if use_v1:
        ops.paged_attention_v1(
        attention_kernels.paged_attention_v1(
            out,
            query,
            kv_cache.key,

@@ -146,7 +146,7 @@ def paged_attention(
        )
        max_logits = torch.empty_like(exp_sums)

        ops.paged_attention_v2(
        attention_kernels.paged_attention_v2(
            out,
            exp_sums,
            max_logits,

@@ -200,12 +200,12 @@ def paged_reshape_and_cache(
):
    if SYSTEM == "cuda":
        try:
            from vllm._C import cache_ops
            import attention_kernels
        except Exception as e:
            raise ImportError(
                f"Could not import vllm paged attention. Make sure your installation is correct. Complete error: {e}"
                f"Could not import attention_kernels. Make sure your installation is correct. Complete error: {e}"
            )
        cache_ops.reshape_and_cache(
        attention_kernels.reshape_and_cache(
            key, value, key_cache, value_cache, slots, "auto", 1.0
        )
    elif SYSTEM == "rocm":

@@ -23,8 +23,10 @@ from typing import Optional, List, Tuple, Any
from text_generation_server.layers.attention.kv_cache import get_kv_scales
from text_generation_server.utils.import_utils import SYSTEM

if SYSTEM != "ipex":
if SYSTEM == "rocm":
    from vllm.model_executor.layers.fused_moe import fused_moe
elif SYSTEM != "ipex":
    from moe_kernels.fused_moe import fused_moe

from text_generation_server.layers.attention import (
    paged_attention,