diff --git a/Dockerfile b/Dockerfile
index 565f3779..0c08d48f 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -161,18 +161,6 @@ COPY server/custom_kernels/ .
 # Build specific version of transformers
 RUN python setup.py build
 
-# Build vllm CUDA kernels
-FROM kernel-builder AS vllm-builder
-
-WORKDIR /usr/src
-
-ENV TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 8.9 9.0+PTX"
-
-COPY server/Makefile-vllm Makefile
-
-# Build specific version of vllm
-RUN make build-vllm-cuda
-
 # Build mamba kernels
 FROM kernel-builder AS mamba-builder
 WORKDIR /usr/src
@@ -230,8 +218,6 @@ COPY --from=awq-kernels-builder /usr/src/llm-awq/awq/kernels/build/lib.linux-x86
 COPY --from=eetq-kernels-builder /usr/src/eetq/build/lib.linux-x86_64-cpython-311 /opt/conda/lib/python3.11/site-packages
 # Copy build artifacts from lorax punica kernels builder
 COPY --from=lorax-punica-builder /usr/src/lorax-punica/server/punica_kernels/build/lib.linux-x86_64-cpython-311 /opt/conda/lib/python3.11/site-packages
-# Copy build artifacts from vllm builder
-COPY --from=vllm-builder /usr/src/vllm/build/lib.linux-x86_64-cpython-311 /opt/conda/lib/python3.11/site-packages
 # Copy build artifacts from mamba builder
 COPY --from=mamba-builder /usr/src/mamba/build/lib.linux-x86_64-cpython-311/ /opt/conda/lib/python3.11/site-packages
 COPY --from=mamba-builder /usr/src/causal-conv1d/build/lib.linux-x86_64-cpython-311/ /opt/conda/lib/python3.11/site-packages
@@ -247,7 +233,7 @@ COPY server/Makefile server/Makefile
 RUN cd server && \
     make gen-server && \
     pip install -r requirements_cuda.txt && \
-    pip install ".[bnb, accelerate, compressed-tensors, marlin, moe, quantize, peft, outlines]" --no-cache-dir && \
+    pip install ".[attention, bnb, accelerate, compressed-tensors, marlin, moe, quantize, peft, outlines]" --no-cache-dir && \
     pip install nvidia-nccl-cu12==2.22.3
 ENV LD_PRELOAD=/opt/conda/lib/python3.11/site-packages/nvidia/nccl/lib/libnccl.so.2
 
diff --git a/flake.lock b/flake.lock
index dfbd7f0e..6d2ff5dc 100644
--- a/flake.lock
+++ b/flake.lock
@@ -978,16 +978,15 @@
         "nixpkgs": "nixpkgs_6"
       },
       "locked": {
-        "lastModified": 1731601436,
-        "narHash": "sha256-PJmXLyz06XnLG3wB5vRLgeJXoVvpuCx6c70khYv6J1o=",
+        "lastModified": 1731674227,
+        "narHash": "sha256-k/ur37KSc+RXcwwz0tgxeamz6wQ5rsOe5hMepzIdD2s=",
         "owner": "huggingface",
         "repo": "text-generation-inference-nix",
-        "rev": "9510f57282795d6e0dbbd163d2b77a6b5bb52566",
+        "rev": "407b9e22a0b7121bf6e171d67ce0144e3f3e39bf",
         "type": "github"
       },
       "original": {
         "owner": "huggingface",
-        "ref": "nixpkgs-update-20241114",
         "repo": "text-generation-inference-nix",
         "type": "github"
       }
diff --git a/flake.nix b/flake.nix
index 708ee65b..f26a983e 100644
--- a/flake.nix
+++ b/flake.nix
@@ -5,7 +5,7 @@
       inputs.nixpkgs.follows = "tgi-nix/nixpkgs";
     };
     nix-filter.url = "github:numtide/nix-filter";
-    tgi-nix.url = "github:huggingface/text-generation-inference-nix/nixpkgs-update-20241114";
+    tgi-nix.url = "github:huggingface/text-generation-inference-nix";
     nixpkgs.follows = "tgi-nix/nixpkgs";
     flake-utils.url = "github:numtide/flake-utils";
     rust-overlay = {
diff --git a/nix/server.nix b/nix/server.nix
index a96e53ac..5903a65a 100644
--- a/nix/server.nix
+++ b/nix/server.nix
@@ -3,6 +3,7 @@
   buildPythonPackage,
   poetry-core,
   mypy-protobuf,
+  attention-kernels,
   awq-inference-engine,
   causal-conv1d,
   compressed-tensors,
@@ -27,15 +28,18 @@
   opentelemetry-exporter-otlp,
   opentelemetry-instrumentation-grpc,
   opentelemetry-semantic-conventions,
+  outlines,
   peft,
+  prometheus-client,
   punica-kernels,
+  py-cpuinfo,
+  pydantic,
   safetensors,
   tokenizers,
   torch,
   sentencepiece,
   transformers,
   typer,
-  vllm,
 }:
 
 let
@@ -72,6 +76,7 @@ buildPythonPackage {
   pythonRemoveDeps = [ "scipy" ];
 
   dependencies = [
+    attention-kernels
    awq-inference-engine
    eetq
    causal-conv1d
@@ -95,14 +100,17 @@ buildPythonPackage {
    opentelemetry-exporter-otlp
    opentelemetry-instrumentation-grpc
    opentelemetry-semantic-conventions
+    outlines
    peft
+    prometheus-client
    punica-kernels
+    py-cpuinfo
+    pydantic
    safetensors
    sentencepiece
    tokenizers
    transformers
    typer
-    vllm
   ];
 
   prePatch = ''
diff --git a/router/src/lib.rs b/router/src/lib.rs
index d9cacb91..c0155852 100644
--- a/router/src/lib.rs
+++ b/router/src/lib.rs
@@ -22,6 +22,7 @@ use tracing::warn;
 use utoipa::ToSchema;
 use validation::Validation;
 
+#[allow(clippy::large_enum_variant)]
 #[derive(Clone)]
 pub enum Tokenizer {
     Python {
diff --git a/server/Makefile b/server/Makefile
index 5f9f9654..b5677db8 100644
--- a/server/Makefile
+++ b/server/Makefile
@@ -29,8 +29,8 @@ install-server: gen-server
 install: install-cuda
 	echo "Installed server"
 
-install-cuda: install-server install-flash-attention-v2-cuda install-vllm-cuda install-flash-attention
-	pip install -e ".[bnb,marlin,moe]"
+install-cuda: install-server install-flash-attention-v2-cuda install-flash-attention
+	pip install -e ".[attention,bnb,marlin,moe]"
 	pip install nvidia-nccl-cu12==2.22.3
 
 install-rocm: install-server install-flash-attention-v2-rocm install-vllm-rocm
diff --git a/server/Makefile-vllm b/server/Makefile-vllm
index 18dcc4a0..45a7980d 100644
--- a/server/Makefile-vllm
+++ b/server/Makefile-vllm
@@ -1,14 +1,4 @@
-commit_cuda := d243e9dc7e2c9c2e36a4150ec8e64809cb55c01b
 commit_rocm := 4e0929e6e4fa0a3d09d358715c288020ea9dc247
-build-vllm-cuda:
-	if [ ! -d 'vllm' ]; then \
-		pip install -U ninja packaging --no-cache-dir && \
-		git clone https://github.com/Narsil/vllm.git vllm; \
-	fi
-	cd vllm && git fetch origin && git checkout $(commit_cuda) && python setup.py build
-
-install-vllm-cuda: build-vllm-cuda
-	cd vllm && git fetch origin && git checkout $(commit_cuda) && pip install -e .
 
 build-vllm-rocm:
 	if [ ! -d 'vllm' ]; then \
diff --git a/server/poetry.lock b/server/poetry.lock
index d03d03ae..34656816 100644
--- a/server/poetry.lock
+++ b/server/poetry.lock
@@ -200,6 +200,74 @@ files = [
     {file = "async_timeout-4.0.3-py3-none-any.whl", hash = "sha256:7405140ff1230c310e51dc27b3145b9092d659ce68ff733fb0cefe3ee42be028"},
 ]
 
+[[package]]
+name = "attention-kernels"
+version = "0.1.1"
+description = "Attention kernels"
+optional = true
+python-versions = ">=3.8"
+files = [
+    {file = "attention_kernels-0.1.1+cu123torch2.4-cp310-cp310-linux_x86_64.whl", hash = "sha256:812851d4ce0f54ca764ff3815a731b15f0cb110115d0aa2d0997cd7794d808bb"},
+]
+
+[package.dependencies]
+torch = "*"
+
+[package.source]
+type = "url"
+url = "https://github.com/danieldk/attention-kernels/releases/download/v0.1.1/attention_kernels-0.1.1+cu123torch2.4-cp310-cp310-linux_x86_64.whl"
+
+[[package]]
+name = "attention-kernels"
+version = "0.1.1"
+description = "Attention kernels"
+optional = true
+python-versions = ">=3.8"
+files = [
+    {file = "attention_kernels-0.1.1+cu123torch2.4-cp311-cp311-linux_x86_64.whl", hash = "sha256:614c402621b11dd1f5741a016b9fd27cb6a68814471f2048bc05206923516268"},
+]
+
+[package.dependencies]
+torch = "*"
+
+[package.source]
+type = "url"
+url = "https://github.com/danieldk/attention-kernels/releases/download/v0.1.1/attention_kernels-0.1.1+cu123torch2.4-cp311-cp311-linux_x86_64.whl"
+
+[[package]]
+name = "attention-kernels"
+version = "0.1.1"
+description = "Attention kernels"
+optional = true
+python-versions = ">=3.8"
+files = [
+    {file = "attention_kernels-0.1.1+cu123torch2.4-cp312-cp312-linux_x86_64.whl", hash = "sha256:6b2ca7c98997431d5f6c4af7553dce6b1bff8dfdec374c97c6ffba71325a02b7"},
+]
+
+[package.dependencies]
+torch = "*"
+
+[package.source]
+type = "url"
+url = "https://github.com/danieldk/attention-kernels/releases/download/v0.1.1/attention_kernels-0.1.1+cu123torch2.4-cp312-cp312-linux_x86_64.whl"
+
+[[package]]
+name = "attention-kernels"
+version = "0.1.1"
+description = "Attention kernels"
+optional = true
+python-versions = ">=3.8"
+files = [
+    {file = "attention_kernels-0.1.1+cu123torch2.4-cp39-cp39-linux_x86_64.whl", hash = "sha256:a56710c5626e461d6f628ae14b74ffc89833578ebd59c3c0c47f5d6f07461fbf"},
+]
+
+[package.dependencies]
+torch = "*"
+
+[package.source]
+type = "url"
+url = "https://github.com/danieldk/attention-kernels/releases/download/v0.1.1/attention_kernels-0.1.1+cu123torch2.4-cp39-cp39-linux_x86_64.whl"
+
 [[package]]
 name = "attrs"
 version = "24.2.0"
@@ -3985,6 +4053,7 @@ type = ["pytest-mypy"]
 
 [extras]
 accelerate = ["accelerate"]
+attention = ["attention-kernels", "attention-kernels", "attention-kernels", "attention-kernels"]
 bnb = ["bitsandbytes"]
 compressed-tensors = ["compressed-tensors"]
 marlin = ["marlin-kernels", "marlin-kernels", "marlin-kernels", "marlin-kernels"]
@@ -3997,4 +4066,4 @@ torch = ["torch"]
 [metadata]
 lock-version = "2.0"
 python-versions = ">=3.9,<3.13"
-content-hash = "5d1295a8becce2f65dc68d64f200acb5832de50fc0c37392f6f87bbc5b15d32a"
+content-hash = "05add88628d836faceae1a26fde4092651a6eca74555ae38ebff879a7895be7e"
diff --git a/server/pyproject.toml b/server/pyproject.toml
index ca65b8c8..f039ca8a 100644
--- a/server/pyproject.toml
+++ b/server/pyproject.toml
@@ -9,7 +9,7 @@ text-generation-server = 'text_generation_server.cli:app'
 
 [tool.poetry.dependencies]
 python = ">=3.9,<3.13"
-protobuf = "^4.25.3"
+protobuf = ">=4.25.3,<6"
 grpcio = "^1.51.1"
 grpcio-status = "^1.51.1"
 grpcio-reflection = "^1.51.1"
@@ -35,12 +35,18 @@ torch = { version = "^2.4.0", optional = true }
 scipy = "^1.11.1"
 pillow = "^10.0.0"
 outlines= { version = "^0.1.1", optional = true }
-prometheus-client = "^0.20.0"
+prometheus-client = ">=0.20.0,<0.22"
 py-cpuinfo = "^9.0.0"
 compressed-tensors = { version = "^0.7.1", optional = true }
 # Remove later, temporary workaround for outlines.
 numpy = "^1.26"
 
+attention-kernels = [
+    { url = "https://github.com/danieldk/attention-kernels/releases/download/v0.1.1/attention_kernels-0.1.1+cu123torch2.4-cp39-cp39-linux_x86_64.whl", python = "~3.9", optional = true },
+    { url = "https://github.com/danieldk/attention-kernels/releases/download/v0.1.1/attention_kernels-0.1.1+cu123torch2.4-cp310-cp310-linux_x86_64.whl", python = "~3.10", optional = true },
+    { url = "https://github.com/danieldk/attention-kernels/releases/download/v0.1.1/attention_kernels-0.1.1+cu123torch2.4-cp311-cp311-linux_x86_64.whl", python = "~3.11", optional = true },
+    { url = "https://github.com/danieldk/attention-kernels/releases/download/v0.1.1/attention_kernels-0.1.1+cu123torch2.4-cp312-cp312-linux_x86_64.whl", python = "~3.12", optional = true },
+]
 marlin-kernels = [
     { url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.1/marlin_kernels-0.3.1+cu123torch2.4-cp39-cp39-linux_x86_64.whl", python = "~3.9", optional = true },
     { url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.1/marlin_kernels-0.3.1+cu123torch2.4-cp310-cp310-linux_x86_64.whl", python = "~3.10", optional = true },
@@ -58,6 +64,7 @@ rich = "^13.7.1"
 [tool.poetry.extras]
 torch = ["torch"]
 accelerate = ["accelerate"]
+attention = ["attention-kernels"]
 bnb = ["bitsandbytes"]
 compressed-tensors = ["compressed-tensors"]
 marlin = ["marlin-kernels"]
diff --git a/server/text_generation_server/layers/attention/cuda.py b/server/text_generation_server/layers/attention/cuda.py
index d705afb0..3038602e 100644
--- a/server/text_generation_server/layers/attention/cuda.py
+++ b/server/text_generation_server/layers/attention/cuda.py
@@ -108,7 +108,7 @@ def paged_attention(
         if softcap is not None:
             raise RuntimeError("Paged attention doesn't support softcapping")
         input_lengths = seqlen.input_lengths + seqlen.cache_lengths
-        from vllm._C import ops
+        import attention_kernels
 
         out = torch.empty_like(query)
 
@@ -116,7 +116,7 @@
             max_num_partitions == 1 or num_seqs * num_heads > 512
         )
         if use_v1:
-            ops.paged_attention_v1(
+            attention_kernels.paged_attention_v1(
                 out,
                 query,
                 kv_cache.key,
@@ -146,7 +146,7 @@
         )
         max_logits = torch.empty_like(exp_sums)
 
-        ops.paged_attention_v2(
+        attention_kernels.paged_attention_v2(
             out,
             exp_sums,
             max_logits,
diff --git a/server/text_generation_server/layers/attention/kv_cache.py b/server/text_generation_server/layers/attention/kv_cache.py
index 9d739da5..cad1d98a 100644
--- a/server/text_generation_server/layers/attention/kv_cache.py
+++ b/server/text_generation_server/layers/attention/kv_cache.py
@@ -200,12 +200,12 @@ def paged_reshape_and_cache(
 ):
     if SYSTEM == "cuda":
         try:
-            from vllm._C import cache_ops
+            import attention_kernels
         except Exception as e:
             raise ImportError(
-                f"Could not import vllm paged attention. Make sure your installation is correct. Complete error: {e}"
+                f"Could not import attention_kernels. Make sure your installation is correct. Complete error: {e}"
             )
-        cache_ops.reshape_and_cache(
+        attention_kernels.reshape_and_cache(
             key, value, key_cache, value_cache, slots, "auto", 1.0
         )
     elif SYSTEM == "rocm":
diff --git a/server/text_generation_server/models/custom_modeling/flash_dbrx_modeling.py b/server/text_generation_server/models/custom_modeling/flash_dbrx_modeling.py
index f70bff4f..57118362 100644
--- a/server/text_generation_server/models/custom_modeling/flash_dbrx_modeling.py
+++ b/server/text_generation_server/models/custom_modeling/flash_dbrx_modeling.py
@@ -23,8 +23,10 @@ from typing import Optional, List, Tuple, Any
 from text_generation_server.layers.attention.kv_cache import get_kv_scales
 from text_generation_server.utils.import_utils import SYSTEM
 
-if SYSTEM != "ipex":
+if SYSTEM == "rocm":
     from vllm.model_executor.layers.fused_moe import fused_moe
+elif SYSTEM != "ipex":
+    from moe_kernels.fused_moe import fused_moe
 
 from text_generation_server.layers.attention import (
     paged_attention,
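
Note (not part of the patch): after installing the new `attention` extra (for example via `pip install ".[attention]"` in `server/`), a minimal sanity check such as the sketch below can confirm that the `attention_kernels` wheel exposes the three entry points this diff switches to from `vllm._C`. The op names are taken directly from the diff; everything else is an illustrative assumption, and the check only inspects attributes rather than launching any CUDA kernel.

# Illustrative sanity check (assumption: the attention-kernels wheel is installed).
import attention_kernels

# Ops this patch now calls instead of vllm._C.ops / vllm._C.cache_ops.
required_ops = ("paged_attention_v1", "paged_attention_v2", "reshape_and_cache")
missing = [op for op in required_ops if not hasattr(attention_kernels, op)]
assert not missing, f"attention_kernels is missing: {missing}"
print("attention_kernels provides all ops previously imported from vllm._C")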