From 97f7a22f0b0f57edc840beaf152e7fd102ed8311 Mon Sep 17 00:00:00 2001 From: "Wang, Yi" Date: Thu, 7 Nov 2024 21:43:38 +0800 Subject: [PATCH 01/11] add trust_remote_code in tokenizer to fix baichuan issue (#2725) Signed-off-by: Wang, Yi A --- router/src/lib.rs | 10 ++++++++-- router/src/server.rs | 1 + router/src/validation.rs | 4 +++- 3 files changed, 12 insertions(+), 3 deletions(-) diff --git a/router/src/lib.rs b/router/src/lib.rs index a5613f89..d9cacb91 100644 --- a/router/src/lib.rs +++ b/router/src/lib.rs @@ -27,6 +27,7 @@ pub enum Tokenizer { Python { tokenizer_name: String, revision: Option, + trust_remote_code: bool, }, Rust(tokenizers::Tokenizer), } @@ -38,15 +39,20 @@ impl<'a> PyTokenizer<'a> { py: Python<'a>, tokenizer_name: String, revision: Option, + trust_remote_code: bool, ) -> PyResult> { let transformers = py.import_bound("transformers")?; let auto = transformers.getattr("AutoTokenizer")?; let from_pretrained = auto.getattr("from_pretrained")?; let args = (tokenizer_name,); let kwargs = if let Some(rev) = &revision { - [("revision", rev.to_string())].into_py_dict_bound(py) + [ + ("revision", rev.to_string().into_py(py)), + ("trust_remote_code", trust_remote_code.into_py(py)), + ] + .into_py_dict_bound(py) } else { - pyo3::types::PyDict::new_bound(py) + [("trust_remote_code", trust_remote_code.into_py(py))].into_py_dict_bound(py) }; let tokenizer = from_pretrained.call(args, Some(&kwargs))?; tracing::info!("Loaded a python tokenizer"); diff --git a/router/src/server.rs b/router/src/server.rs index 7d8d518c..2058bce3 100644 --- a/router/src/server.rs +++ b/router/src/server.rs @@ -1829,6 +1829,7 @@ pub async fn run( Tokenizer::Python { tokenizer_name: tokenizer_name.clone(), revision: revision.clone(), + trust_remote_code, } } }; diff --git a/router/src/validation.rs b/router/src/validation.rs index 5b2a153c..3cd85a6e 100644 --- a/router/src/validation.rs +++ b/router/src/validation.rs @@ -439,9 +439,11 @@ fn tokenizer_worker( Tokenizer::Python { 
tokenizer_name, revision, + trust_remote_code, } => { pyo3::Python::with_gil(|py| -> pyo3::PyResult<()> { - let tokenizer = PyTokenizer::from_py(py, tokenizer_name, revision)?; + let tokenizer = + PyTokenizer::from_py(py, tokenizer_name, revision, trust_remote_code)?; // Loop over requests while let Some(((inputs, add_special_tokens, truncate), response_tx, parent_span)) = receiver.blocking_recv() From a7850008429c4c1c4a2ded7bbed4c1b12d22d287 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= Date: Sun, 10 Nov 2024 13:54:07 +0100 Subject: [PATCH 02/11] Add initial support for compressed-tensors checkpoints (#2732) compressed-tensors is a safetensors extension for sparse, quantized tensors. The format is more powerful than earlier AWQ/GPTQ/FP8 quantization, because - Different quantizer configurations can be used for different targets. - The format can specify input/output quantizers in addition to weight quantizers. - Configurable exclusions for quantization. This change adds a dependency on the `compressed-tensors` package for its configuration parsing and layer matching functionality. The following types of quantization are supported in this PR: - W8A16 and W4A16 INT using GPTQ-Marlin kernels. - W8A8 and W8A16 FP using FP8-Marlin and cutlass kernels. Support for other quantization types will be added in subsequent PRs. 
--- Dockerfile | 2 +- Dockerfile_amd | 2 +- Dockerfile_intel | 2 +- docs/source/reference/launcher.md | 19 +- flake.lock | 7 +- flake.nix | 2 +- .../test_compressed_tensors_w8an.json | 104 +++++ ...st_compressed_tensors_w8an_all_params.json | 99 +++++ .../test_compressed_tensors_w8an_load.json | 418 ++++++++++++++++++ .../test_compressed_tensors_wna16.json | 104 +++++ ...t_compressed_tensors_wna16_all_params.json | 99 +++++ .../test_compressed_tensors_wna16_load.json | 418 ++++++++++++++++++ .../models/test_compressed_tensors_w8an_fp.py | 86 ++++ .../test_compressed_tensors_wna16_int.py | 86 ++++ launcher/src/main.rs | 5 + nix/server.nix | 2 + server/Makefile | 2 +- server/poetry.lock | 24 +- server/pyproject.toml | 2 + server/text_generation_server/cli.py | 1 + .../layers/compressed_tensors/__init__.py | 3 + .../layers/compressed_tensors/loader.py | 174 ++++++++ .../layers/compressed_tensors/w8an_fp.py | 174 ++++++++ .../layers/compressed_tensors/wna16_int.py | 188 ++++++++ server/text_generation_server/layers/fp8.py | 18 +- .../layers/marlin/gptq.py | 4 +- .../text_generation_server/models/__init__.py | 37 +- .../utils/quantization.py | 48 +- 28 files changed, 2052 insertions(+), 78 deletions(-) create mode 100644 integration-tests/models/__snapshots__/test_compressed_tensors_w8an_fp/test_compressed_tensors_w8an.json create mode 100644 integration-tests/models/__snapshots__/test_compressed_tensors_w8an_fp/test_compressed_tensors_w8an_all_params.json create mode 100644 integration-tests/models/__snapshots__/test_compressed_tensors_w8an_fp/test_compressed_tensors_w8an_load.json create mode 100644 integration-tests/models/__snapshots__/test_compressed_tensors_wna16_int/test_compressed_tensors_wna16.json create mode 100644 integration-tests/models/__snapshots__/test_compressed_tensors_wna16_int/test_compressed_tensors_wna16_all_params.json create mode 100644 
integration-tests/models/__snapshots__/test_compressed_tensors_wna16_int/test_compressed_tensors_wna16_load.json create mode 100644 integration-tests/models/test_compressed_tensors_w8an_fp.py create mode 100644 integration-tests/models/test_compressed_tensors_wna16_int.py create mode 100644 server/text_generation_server/layers/compressed_tensors/__init__.py create mode 100644 server/text_generation_server/layers/compressed_tensors/loader.py create mode 100644 server/text_generation_server/layers/compressed_tensors/w8an_fp.py create mode 100644 server/text_generation_server/layers/compressed_tensors/wna16_int.py diff --git a/Dockerfile b/Dockerfile index d4189c9f..565f3779 100644 --- a/Dockerfile +++ b/Dockerfile @@ -247,7 +247,7 @@ COPY server/Makefile server/Makefile RUN cd server && \ make gen-server && \ pip install -r requirements_cuda.txt && \ - pip install ".[bnb, accelerate, marlin, moe, quantize, peft, outlines]" --no-cache-dir && \ + pip install ".[bnb, accelerate, compressed-tensors, marlin, moe, quantize, peft, outlines]" --no-cache-dir && \ pip install nvidia-nccl-cu12==2.22.3 ENV LD_PRELOAD=/opt/conda/lib/python3.11/site-packages/nvidia/nccl/lib/libnccl.so.2 diff --git a/Dockerfile_amd b/Dockerfile_amd index b84d4edd..7638947a 100644 --- a/Dockerfile_amd +++ b/Dockerfile_amd @@ -296,7 +296,7 @@ COPY server/Makefile server/Makefile RUN cd server && \ make gen-server && \ pip install -r requirements_rocm.txt && \ - pip install ".[accelerate, peft, outlines]" --no-cache-dir + pip install ".[accelerate, compressed-tensors, peft, outlines]" --no-cache-dir # Install benchmarker COPY --from=builder /usr/src/target/release-opt/text-generation-benchmark /usr/local/bin/text-generation-benchmark diff --git a/Dockerfile_intel b/Dockerfile_intel index f9b1cd13..c3555eab 100644 --- a/Dockerfile_intel +++ b/Dockerfile_intel @@ -102,7 +102,7 @@ COPY server/Makefile server/Makefile RUN cd server && \ make gen-server && \ pip install -r requirements_intel.txt && \ - pip 
install ".[accelerate, peft, outlines]" --no-cache-dir + pip install ".[accelerate, compressed-tensors, peft, outlines]" --no-cache-dir ENV CCL_ROOT=/opt/intel/oneapi/ccl/latest ENV I_MPI_ROOT=/opt/intel/oneapi/mpi/latest diff --git a/docs/source/reference/launcher.md b/docs/source/reference/launcher.md index da0c8717..da52d59a 100644 --- a/docs/source/reference/launcher.md +++ b/docs/source/reference/launcher.md @@ -62,15 +62,16 @@ Options: [env: QUANTIZE=] Possible values: - - awq: 4 bit quantization. Requires a specific AWQ quantized model: . Should replace GPTQ models wherever possible because of the better latency - - eetq: 8 bit quantization, doesn't require specific model. Should be a drop-in replacement to bitsandbytes with much better performance. Kernels are from - - exl2: Variable bit quantization. Requires a specific EXL2 quantized model: . Requires exllama2 kernels and does not support tensor parallelism (num_shard > 1) - - gptq: 4 bit quantization. Requires a specific GTPQ quantized model: . text-generation-inference will use exllama (faster) kernels wherever possible, and use triton kernel (wider support) when it's not. AWQ has faster kernels - - marlin: 4 bit quantization. Requires a specific Marlin quantized model: - - bitsandbytes: Bitsandbytes 8bit. Can be applied on any model, will cut the memory requirement in half, but it is known that the model will be much slower to run than the native f16 - - bitsandbytes-nf4: Bitsandbytes 4bit. Can be applied on any model, will cut the memory requirement by 4x, but it is known that the model will be much slower to run than the native f16 - - bitsandbytes-fp4: Bitsandbytes 4bit. 
nf4 should be preferred in most cases but maybe this one has better perplexity performance for you model - - fp8: [FP8](https://developer.nvidia.com/blog/nvidia-arm-and-intel-publish-fp8-specification-for-standardization-as-an-interchange-format-for-ai/) (e4m3) works on H100 and above This dtype has native ops should be the fastest if available. This is currently not the fastest because of local unpacking + padding to satisfy matrix multiplication limitations + - awq: 4 bit quantization. Requires a specific AWQ quantized model: . Should replace GPTQ models wherever possible because of the better latency + - compressed-tensors: Compressed tensors, which can be a mixture of different quantization methods + - eetq: 8 bit quantization, doesn't require specific model. Should be a drop-in replacement to bitsandbytes with much better performance. Kernels are from + - exl2: Variable bit quantization. Requires a specific EXL2 quantized model: . Requires exllama2 kernels and does not support tensor parallelism (num_shard > 1) + - gptq: 4 bit quantization. Requires a specific GTPQ quantized model: . text-generation-inference will use exllama (faster) kernels wherever possible, and use triton kernel (wider support) when it's not. AWQ has faster kernels + - marlin: 4 bit quantization. Requires a specific Marlin quantized model: + - bitsandbytes: Bitsandbytes 8bit. Can be applied on any model, will cut the memory requirement in half, but it is known that the model will be much slower to run than the native f16 + - bitsandbytes-nf4: Bitsandbytes 4bit. Can be applied on any model, will cut the memory requirement by 4x, but it is known that the model will be much slower to run than the native f16 + - bitsandbytes-fp4: Bitsandbytes 4bit. 
nf4 should be preferred in most cases but maybe this one has better perplexity performance for you model + - fp8: [FP8](https://developer.nvidia.com/blog/nvidia-arm-and-intel-publish-fp8-specification-for-standardization-as-an-interchange-format-for-ai/) (e4m3) works on H100 and above This dtype has native ops should be the fastest if available. This is currently not the fastest because of local unpacking + padding to satisfy matrix multiplication limitations ``` ## SPECULATE diff --git a/flake.lock b/flake.lock index 5246f424..c5515ae2 100644 --- a/flake.lock +++ b/flake.lock @@ -978,15 +978,16 @@ "nixpkgs": "nixpkgs_6" }, "locked": { - "lastModified": 1730724647, - "narHash": "sha256-SVv+50CGaCoU4zZwsg6ZAaOi/D5QJBL1P2SIB+3CEf4=", + "lastModified": 1730795478, + "narHash": "sha256-xpkXDKnkhXO4F6Ea3reHmqwXXRzQe2PsxdRQFPCViWs=", "owner": "huggingface", "repo": "text-generation-inference-nix", - "rev": "1512898a1e5ad9eff025205fa9c4d33a44506cf3", + "rev": "b7f6c07867d94d6e55f5352573a6b3dad1c88e56", "type": "github" }, "original": { "owner": "huggingface", + "ref": "compressed-tensors-0.7.1", "repo": "text-generation-inference-nix", "type": "github" } diff --git a/flake.nix b/flake.nix index f26a983e..1a1e6fe2 100644 --- a/flake.nix +++ b/flake.nix @@ -5,7 +5,7 @@ inputs.nixpkgs.follows = "tgi-nix/nixpkgs"; }; nix-filter.url = "github:numtide/nix-filter"; - tgi-nix.url = "github:huggingface/text-generation-inference-nix"; + tgi-nix.url = "github:huggingface/text-generation-inference-nix/compressed-tensors-0.7.1"; nixpkgs.follows = "tgi-nix/nixpkgs"; flake-utils.url = "github:numtide/flake-utils"; rust-overlay = { diff --git a/integration-tests/models/__snapshots__/test_compressed_tensors_w8an_fp/test_compressed_tensors_w8an.json b/integration-tests/models/__snapshots__/test_compressed_tensors_w8an_fp/test_compressed_tensors_w8an.json new file mode 100644 index 00000000..c53a036f --- /dev/null +++ 
b/integration-tests/models/__snapshots__/test_compressed_tensors_w8an_fp/test_compressed_tensors_w8an.json @@ -0,0 +1,104 @@ +{ + "details": { + "best_of_sequences": null, + "finish_reason": "length", + "generated_tokens": 10, + "prefill": [ + { + "id": 128000, + "logprob": null, + "text": "<|begin_of_text|>" + }, + { + "id": 3923, + "logprob": -7.609375, + "text": "What" + }, + { + "id": 374, + "logprob": -0.92529297, + "text": " is" + }, + { + "id": 5655, + "logprob": -10.0, + "text": " deep" + }, + { + "id": 6975, + "logprob": -0.94628906, + "text": " learning" + }, + { + "id": 30, + "logprob": -2.9042969, + "text": "?" + } + ], + "seed": null, + "tokens": [ + { + "id": 18682, + "logprob": -0.8769531, + "special": false, + "text": " Deep" + }, + { + "id": 6975, + "logprob": -0.0076942444, + "special": false, + "text": " learning" + }, + { + "id": 374, + "logprob": -0.25073242, + "special": false, + "text": " is" + }, + { + "id": 264, + "logprob": -0.097595215, + "special": false, + "text": " a" + }, + { + "id": 955, + "logprob": -0.921875, + "special": false, + "text": " type" + }, + { + "id": 315, + "logprob": -0.00027918816, + "special": false, + "text": " of" + }, + { + "id": 21075, + "logprob": -0.5527344, + "special": false, + "text": " artificial" + }, + { + "id": 11478, + "logprob": -0.042541504, + "special": false, + "text": " intelligence" + }, + { + "id": 320, + "logprob": -0.38891602, + "special": false, + "text": " (" + }, + { + "id": 15836, + "logprob": -0.0011043549, + "special": false, + "text": "AI" + } + ], + "top_tokens": null + }, + "generated_text": " Deep learning is a type of artificial intelligence (AI" +} diff --git a/integration-tests/models/__snapshots__/test_compressed_tensors_w8an_fp/test_compressed_tensors_w8an_all_params.json b/integration-tests/models/__snapshots__/test_compressed_tensors_w8an_fp/test_compressed_tensors_w8an_all_params.json new file mode 100644 index 00000000..bb1d6f0e --- /dev/null +++ 
b/integration-tests/models/__snapshots__/test_compressed_tensors_w8an_fp/test_compressed_tensors_w8an_all_params.json @@ -0,0 +1,99 @@ +{ + "details": { + "best_of_sequences": null, + "finish_reason": "length", + "generated_tokens": 10, + "prefill": [ + { + "id": 128000, + "logprob": null, + "text": "<|begin_of_text|>" + }, + { + "id": 3923, + "logprob": -7.609375, + "text": "What" + }, + { + "id": 374, + "logprob": -0.92529297, + "text": " is" + }, + { + "id": 5655, + "logprob": -10.0, + "text": " deep" + }, + { + "id": 6975, + "logprob": -0.94628906, + "text": " learning" + } + ], + "seed": 0, + "tokens": [ + { + "id": 5380, + "logprob": -0.23840332, + "special": false, + "text": "?\n" + }, + { + "id": 34564, + "logprob": 0.0, + "special": false, + "text": "Deep" + }, + { + "id": 6975, + "logprob": 0.0, + "special": false, + "text": " learning" + }, + { + "id": 11, + "logprob": 0.0, + "special": false, + "text": "," + }, + { + "id": 1101, + "logprob": -1.2011719, + "special": false, + "text": " also" + }, + { + "id": 3967, + "logprob": 0.0, + "special": false, + "text": " known" + }, + { + "id": 439, + "logprob": 0.0, + "special": false, + "text": " as" + }, + { + "id": 30828, + "logprob": 0.0, + "special": false, + "text": " neural" + }, + { + "id": 4009, + "logprob": -0.6777344, + "special": false, + "text": " network" + }, + { + "id": 477, + "logprob": 0.0, + "special": false, + "text": " or" + } + ], + "top_tokens": null + }, + "generated_text": "What is deep learning?\nDeep learning, also known as neural network or" +} diff --git a/integration-tests/models/__snapshots__/test_compressed_tensors_w8an_fp/test_compressed_tensors_w8an_load.json b/integration-tests/models/__snapshots__/test_compressed_tensors_w8an_fp/test_compressed_tensors_w8an_load.json new file mode 100644 index 00000000..09f9e3a7 --- /dev/null +++ b/integration-tests/models/__snapshots__/test_compressed_tensors_w8an_fp/test_compressed_tensors_w8an_load.json @@ -0,0 +1,418 @@ +[ + { + 
"details": { + "best_of_sequences": null, + "finish_reason": "length", + "generated_tokens": 10, + "prefill": [ + { + "id": 128000, + "logprob": null, + "text": "<|begin_of_text|>" + }, + { + "id": 3923, + "logprob": -7.609375, + "text": "What" + }, + { + "id": 374, + "logprob": -0.92529297, + "text": " is" + }, + { + "id": 5655, + "logprob": -10.0, + "text": " deep" + }, + { + "id": 6975, + "logprob": -0.94628906, + "text": " learning" + }, + { + "id": 30, + "logprob": -2.9042969, + "text": "?" + } + ], + "seed": null, + "tokens": [ + { + "id": 18682, + "logprob": -0.8769531, + "special": false, + "text": " Deep" + }, + { + "id": 6975, + "logprob": -0.0076942444, + "special": false, + "text": " learning" + }, + { + "id": 374, + "logprob": -0.25146484, + "special": false, + "text": " is" + }, + { + "id": 264, + "logprob": -0.097595215, + "special": false, + "text": " a" + }, + { + "id": 955, + "logprob": -0.9248047, + "special": false, + "text": " type" + }, + { + "id": 315, + "logprob": -0.00027513504, + "special": false, + "text": " of" + }, + { + "id": 21075, + "logprob": -0.5527344, + "special": false, + "text": " artificial" + }, + { + "id": 11478, + "logprob": -0.043151855, + "special": false, + "text": " intelligence" + }, + { + "id": 320, + "logprob": -0.3840332, + "special": false, + "text": " (" + }, + { + "id": 15836, + "logprob": -0.0011043549, + "special": false, + "text": "AI" + } + ], + "top_tokens": null + }, + "generated_text": " Deep learning is a type of artificial intelligence (AI" + }, + { + "details": { + "best_of_sequences": null, + "finish_reason": "length", + "generated_tokens": 10, + "prefill": [ + { + "id": 128000, + "logprob": null, + "text": "<|begin_of_text|>" + }, + { + "id": 3923, + "logprob": -7.6054688, + "text": "What" + }, + { + "id": 374, + "logprob": -0.92089844, + "text": " is" + }, + { + "id": 5655, + "logprob": -10.0, + "text": " deep" + }, + { + "id": 6975, + "logprob": -0.94433594, + "text": " learning" + }, + { + "id": 
30, + "logprob": -2.90625, + "text": "?" + } + ], + "seed": null, + "tokens": [ + { + "id": 18682, + "logprob": -0.875, + "special": false, + "text": " Deep" + }, + { + "id": 6975, + "logprob": -0.007698059, + "special": false, + "text": " learning" + }, + { + "id": 374, + "logprob": -0.25268555, + "special": false, + "text": " is" + }, + { + "id": 264, + "logprob": -0.09753418, + "special": false, + "text": " a" + }, + { + "id": 955, + "logprob": -0.92529297, + "special": false, + "text": " type" + }, + { + "id": 315, + "logprob": -0.00027942657, + "special": false, + "text": " of" + }, + { + "id": 21075, + "logprob": -0.5527344, + "special": false, + "text": " artificial" + }, + { + "id": 11478, + "logprob": -0.042541504, + "special": false, + "text": " intelligence" + }, + { + "id": 320, + "logprob": -0.3840332, + "special": false, + "text": " (" + }, + { + "id": 15836, + "logprob": -0.0011053085, + "special": false, + "text": "AI" + } + ], + "top_tokens": null + }, + "generated_text": " Deep learning is a type of artificial intelligence (AI" + }, + { + "details": { + "best_of_sequences": null, + "finish_reason": "length", + "generated_tokens": 10, + "prefill": [ + { + "id": 128000, + "logprob": null, + "text": "<|begin_of_text|>" + }, + { + "id": 3923, + "logprob": -7.6054688, + "text": "What" + }, + { + "id": 374, + "logprob": -0.92089844, + "text": " is" + }, + { + "id": 5655, + "logprob": -10.0, + "text": " deep" + }, + { + "id": 6975, + "logprob": -0.94433594, + "text": " learning" + }, + { + "id": 30, + "logprob": -2.90625, + "text": "?" 
+ } + ], + "seed": null, + "tokens": [ + { + "id": 18682, + "logprob": -0.875, + "special": false, + "text": " Deep" + }, + { + "id": 6975, + "logprob": -0.007698059, + "special": false, + "text": " learning" + }, + { + "id": 374, + "logprob": -0.25268555, + "special": false, + "text": " is" + }, + { + "id": 264, + "logprob": -0.09753418, + "special": false, + "text": " a" + }, + { + "id": 955, + "logprob": -0.92529297, + "special": false, + "text": " type" + }, + { + "id": 315, + "logprob": -0.00027942657, + "special": false, + "text": " of" + }, + { + "id": 21075, + "logprob": -0.5527344, + "special": false, + "text": " artificial" + }, + { + "id": 11478, + "logprob": -0.042541504, + "special": false, + "text": " intelligence" + }, + { + "id": 320, + "logprob": -0.3840332, + "special": false, + "text": " (" + }, + { + "id": 15836, + "logprob": -0.0011053085, + "special": false, + "text": "AI" + } + ], + "top_tokens": null + }, + "generated_text": " Deep learning is a type of artificial intelligence (AI" + }, + { + "details": { + "best_of_sequences": null, + "finish_reason": "length", + "generated_tokens": 10, + "prefill": [ + { + "id": 128000, + "logprob": null, + "text": "<|begin_of_text|>" + }, + { + "id": 3923, + "logprob": -7.6054688, + "text": "What" + }, + { + "id": 374, + "logprob": -0.92089844, + "text": " is" + }, + { + "id": 5655, + "logprob": -10.0, + "text": " deep" + }, + { + "id": 6975, + "logprob": -0.94433594, + "text": " learning" + }, + { + "id": 30, + "logprob": -2.90625, + "text": "?" 
+ } + ], + "seed": null, + "tokens": [ + { + "id": 18682, + "logprob": -0.875, + "special": false, + "text": " Deep" + }, + { + "id": 6975, + "logprob": -0.007698059, + "special": false, + "text": " learning" + }, + { + "id": 374, + "logprob": -0.25268555, + "special": false, + "text": " is" + }, + { + "id": 264, + "logprob": -0.09753418, + "special": false, + "text": " a" + }, + { + "id": 955, + "logprob": -0.92529297, + "special": false, + "text": " type" + }, + { + "id": 315, + "logprob": -0.00027942657, + "special": false, + "text": " of" + }, + { + "id": 21075, + "logprob": -0.5527344, + "special": false, + "text": " artificial" + }, + { + "id": 11478, + "logprob": -0.042541504, + "special": false, + "text": " intelligence" + }, + { + "id": 320, + "logprob": -0.3840332, + "special": false, + "text": " (" + }, + { + "id": 15836, + "logprob": -0.0011053085, + "special": false, + "text": "AI" + } + ], + "top_tokens": null + }, + "generated_text": " Deep learning is a type of artificial intelligence (AI" + } +] diff --git a/integration-tests/models/__snapshots__/test_compressed_tensors_wna16_int/test_compressed_tensors_wna16.json b/integration-tests/models/__snapshots__/test_compressed_tensors_wna16_int/test_compressed_tensors_wna16.json new file mode 100644 index 00000000..bc4acf60 --- /dev/null +++ b/integration-tests/models/__snapshots__/test_compressed_tensors_wna16_int/test_compressed_tensors_wna16.json @@ -0,0 +1,104 @@ +{ + "details": { + "best_of_sequences": null, + "finish_reason": "length", + "generated_tokens": 10, + "prefill": [ + { + "id": 2, + "logprob": null, + "text": "" + }, + { + "id": 1841, + "logprob": -5.46875, + "text": "What" + }, + { + "id": 603, + "logprob": -0.69140625, + "text": " is" + }, + { + "id": 5271, + "logprob": -12.0, + "text": " deep" + }, + { + "id": 6044, + "logprob": -0.32226562, + "text": " learning" + }, + { + "id": 235336, + "logprob": -0.33203125, + "text": "?" 
+ } + ], + "seed": null, + "tokens": [ + { + "id": 109, + "logprob": -0.24707031, + "special": false, + "text": "\n\n" + }, + { + "id": 26843, + "logprob": -0.14550781, + "special": false, + "text": "Deep" + }, + { + "id": 6044, + "logprob": -0.038330078, + "special": false, + "text": " learning" + }, + { + "id": 603, + "logprob": -0.029907227, + "special": false, + "text": " is" + }, + { + "id": 476, + "logprob": -0.020996094, + "special": false, + "text": " a" + }, + { + "id": 38397, + "logprob": -0.828125, + "special": false, + "text": " subset" + }, + { + "id": 576, + "logprob": -0.00049209595, + "special": false, + "text": " of" + }, + { + "id": 6479, + "logprob": -0.057373047, + "special": false, + "text": " machine" + }, + { + "id": 6044, + "logprob": -0.000207901, + "special": false, + "text": " learning" + }, + { + "id": 674, + "logprob": -0.15429688, + "special": false, + "text": " that" + } + ], + "top_tokens": null + }, + "generated_text": "\n\nDeep learning is a subset of machine learning that" +} diff --git a/integration-tests/models/__snapshots__/test_compressed_tensors_wna16_int/test_compressed_tensors_wna16_all_params.json b/integration-tests/models/__snapshots__/test_compressed_tensors_wna16_int/test_compressed_tensors_wna16_all_params.json new file mode 100644 index 00000000..9999f3ae --- /dev/null +++ b/integration-tests/models/__snapshots__/test_compressed_tensors_wna16_int/test_compressed_tensors_wna16_all_params.json @@ -0,0 +1,99 @@ +{ + "details": { + "best_of_sequences": null, + "finish_reason": "length", + "generated_tokens": 10, + "prefill": [ + { + "id": 2, + "logprob": null, + "text": "" + }, + { + "id": 1841, + "logprob": -5.46875, + "text": "What" + }, + { + "id": 603, + "logprob": -0.69140625, + "text": " is" + }, + { + "id": 5271, + "logprob": -12.0, + "text": " deep" + }, + { + "id": 6044, + "logprob": -0.32226562, + "text": " learning" + } + ], + "seed": 0, + "tokens": [ + { + "id": 235336, + "logprob": 0.0, + "special": false, + 
"text": "?" + }, + { + "id": 109, + "logprob": 0.0, + "special": false, + "text": "\n\n" + }, + { + "id": 26843, + "logprob": 0.0, + "special": false, + "text": "Deep" + }, + { + "id": 14715, + "logprob": -0.38671875, + "special": false, + "text": " Learning" + }, + { + "id": 603, + "logprob": 0.0, + "special": false, + "text": " is" + }, + { + "id": 476, + "logprob": 0.0, + "special": false, + "text": " a" + }, + { + "id": 38397, + "logprob": -0.12695312, + "special": false, + "text": " subset" + }, + { + "id": 576, + "logprob": 0.0, + "special": false, + "text": " of" + }, + { + "id": 6479, + "logprob": 0.0, + "special": false, + "text": " machine" + }, + { + "id": 6044, + "logprob": 0.0, + "special": false, + "text": " learning" + } + ], + "top_tokens": null + }, + "generated_text": "What is deep learning?\n\nDeep Learning is a subset of machine learning" +} diff --git a/integration-tests/models/__snapshots__/test_compressed_tensors_wna16_int/test_compressed_tensors_wna16_load.json b/integration-tests/models/__snapshots__/test_compressed_tensors_wna16_int/test_compressed_tensors_wna16_load.json new file mode 100644 index 00000000..a4b3b590 --- /dev/null +++ b/integration-tests/models/__snapshots__/test_compressed_tensors_wna16_int/test_compressed_tensors_wna16_load.json @@ -0,0 +1,418 @@ +[ + { + "details": { + "best_of_sequences": null, + "finish_reason": "length", + "generated_tokens": 10, + "prefill": [ + { + "id": 2, + "logprob": null, + "text": "" + }, + { + "id": 1841, + "logprob": -5.46875, + "text": "What" + }, + { + "id": 603, + "logprob": -0.69140625, + "text": " is" + }, + { + "id": 5271, + "logprob": -12.0, + "text": " deep" + }, + { + "id": 6044, + "logprob": -0.32226562, + "text": " learning" + }, + { + "id": 235336, + "logprob": -0.33203125, + "text": "?" 
+ } + ], + "seed": null, + "tokens": [ + { + "id": 109, + "logprob": -0.24707031, + "special": false, + "text": "\n\n" + }, + { + "id": 26843, + "logprob": -0.14550781, + "special": false, + "text": "Deep" + }, + { + "id": 6044, + "logprob": -0.03857422, + "special": false, + "text": " learning" + }, + { + "id": 603, + "logprob": -0.030883789, + "special": false, + "text": " is" + }, + { + "id": 476, + "logprob": -0.020996094, + "special": false, + "text": " a" + }, + { + "id": 38397, + "logprob": -0.828125, + "special": false, + "text": " subset" + }, + { + "id": 576, + "logprob": -0.00051498413, + "special": false, + "text": " of" + }, + { + "id": 6479, + "logprob": -0.05883789, + "special": false, + "text": " machine" + }, + { + "id": 6044, + "logprob": -0.00020694733, + "special": false, + "text": " learning" + }, + { + "id": 674, + "logprob": -0.15820312, + "special": false, + "text": " that" + } + ], + "top_tokens": null + }, + "generated_text": "\n\nDeep learning is a subset of machine learning that" + }, + { + "details": { + "best_of_sequences": null, + "finish_reason": "length", + "generated_tokens": 10, + "prefill": [ + { + "id": 2, + "logprob": null, + "text": "" + }, + { + "id": 1841, + "logprob": -5.46875, + "text": "What" + }, + { + "id": 603, + "logprob": -0.71484375, + "text": " is" + }, + { + "id": 5271, + "logprob": -12.0, + "text": " deep" + }, + { + "id": 6044, + "logprob": -0.30859375, + "text": " learning" + }, + { + "id": 235336, + "logprob": -0.3359375, + "text": "?" 
+ } + ], + "seed": null, + "tokens": [ + { + "id": 109, + "logprob": -0.23828125, + "special": false, + "text": "\n\n" + }, + { + "id": 26843, + "logprob": -0.14550781, + "special": false, + "text": "Deep" + }, + { + "id": 6044, + "logprob": -0.038330078, + "special": false, + "text": " learning" + }, + { + "id": 603, + "logprob": -0.030883789, + "special": false, + "text": " is" + }, + { + "id": 476, + "logprob": -0.020996094, + "special": false, + "text": " a" + }, + { + "id": 38397, + "logprob": -0.80859375, + "special": false, + "text": " subset" + }, + { + "id": 576, + "logprob": -0.0005455017, + "special": false, + "text": " of" + }, + { + "id": 6479, + "logprob": -0.05908203, + "special": false, + "text": " machine" + }, + { + "id": 6044, + "logprob": -0.00020599365, + "special": false, + "text": " learning" + }, + { + "id": 674, + "logprob": -0.17285156, + "special": false, + "text": " that" + } + ], + "top_tokens": null + }, + "generated_text": "\n\nDeep learning is a subset of machine learning that" + }, + { + "details": { + "best_of_sequences": null, + "finish_reason": "length", + "generated_tokens": 10, + "prefill": [ + { + "id": 2, + "logprob": null, + "text": "" + }, + { + "id": 1841, + "logprob": -5.46875, + "text": "What" + }, + { + "id": 603, + "logprob": -0.71484375, + "text": " is" + }, + { + "id": 5271, + "logprob": -12.0, + "text": " deep" + }, + { + "id": 6044, + "logprob": -0.30859375, + "text": " learning" + }, + { + "id": 235336, + "logprob": -0.3359375, + "text": "?" 
+ } + ], + "seed": null, + "tokens": [ + { + "id": 109, + "logprob": -0.23828125, + "special": false, + "text": "\n\n" + }, + { + "id": 26843, + "logprob": -0.14550781, + "special": false, + "text": "Deep" + }, + { + "id": 6044, + "logprob": -0.038330078, + "special": false, + "text": " learning" + }, + { + "id": 603, + "logprob": -0.030883789, + "special": false, + "text": " is" + }, + { + "id": 476, + "logprob": -0.020996094, + "special": false, + "text": " a" + }, + { + "id": 38397, + "logprob": -0.80859375, + "special": false, + "text": " subset" + }, + { + "id": 576, + "logprob": -0.0005455017, + "special": false, + "text": " of" + }, + { + "id": 6479, + "logprob": -0.05908203, + "special": false, + "text": " machine" + }, + { + "id": 6044, + "logprob": -0.00020599365, + "special": false, + "text": " learning" + }, + { + "id": 674, + "logprob": -0.17285156, + "special": false, + "text": " that" + } + ], + "top_tokens": null + }, + "generated_text": "\n\nDeep learning is a subset of machine learning that" + }, + { + "details": { + "best_of_sequences": null, + "finish_reason": "length", + "generated_tokens": 10, + "prefill": [ + { + "id": 2, + "logprob": null, + "text": "" + }, + { + "id": 1841, + "logprob": -5.46875, + "text": "What" + }, + { + "id": 603, + "logprob": -0.71484375, + "text": " is" + }, + { + "id": 5271, + "logprob": -12.0, + "text": " deep" + }, + { + "id": 6044, + "logprob": -0.30859375, + "text": " learning" + }, + { + "id": 235336, + "logprob": -0.3359375, + "text": "?" 
+ } + ], + "seed": null, + "tokens": [ + { + "id": 109, + "logprob": -0.23828125, + "special": false, + "text": "\n\n" + }, + { + "id": 26843, + "logprob": -0.14550781, + "special": false, + "text": "Deep" + }, + { + "id": 6044, + "logprob": -0.038330078, + "special": false, + "text": " learning" + }, + { + "id": 603, + "logprob": -0.030883789, + "special": false, + "text": " is" + }, + { + "id": 476, + "logprob": -0.020996094, + "special": false, + "text": " a" + }, + { + "id": 38397, + "logprob": -0.80859375, + "special": false, + "text": " subset" + }, + { + "id": 576, + "logprob": -0.0005455017, + "special": false, + "text": " of" + }, + { + "id": 6479, + "logprob": -0.05908203, + "special": false, + "text": " machine" + }, + { + "id": 6044, + "logprob": -0.00020599365, + "special": false, + "text": " learning" + }, + { + "id": 674, + "logprob": -0.17285156, + "special": false, + "text": " that" + } + ], + "top_tokens": null + }, + "generated_text": "\n\nDeep learning is a subset of machine learning that" + } +] diff --git a/integration-tests/models/test_compressed_tensors_w8an_fp.py b/integration-tests/models/test_compressed_tensors_w8an_fp.py new file mode 100644 index 00000000..09b16380 --- /dev/null +++ b/integration-tests/models/test_compressed_tensors_w8an_fp.py @@ -0,0 +1,86 @@ +import pytest + + +@pytest.fixture(scope="module") +def compressed_tensors_w8an_handle(launcher): + with launcher( + "neuralmagic/Llama-3.2-1B-Instruct-FP8", + num_shard=2, + quantize="compressed-tensors", + ) as handle: + yield handle + + +@pytest.fixture(scope="module") +async def compressed_tensors_w8an(compressed_tensors_w8an_handle): + await compressed_tensors_w8an_handle.health(300) + return compressed_tensors_w8an_handle.client + + +@pytest.mark.release +@pytest.mark.asyncio +@pytest.mark.private +async def test_compressed_tensors_w8an(compressed_tensors_w8an, response_snapshot): + response = await compressed_tensors_w8an.generate( + "What is deep learning?", + 
max_new_tokens=10, + decoder_input_details=True, + ) + + assert ( + response.generated_text + == " Deep learning is a type of artificial intelligence (AI" + ) + assert response.details.generated_tokens == 10 + assert response == response_snapshot + + +@pytest.mark.asyncio +async def test_compressed_tensors_w8an_all_params( + compressed_tensors_w8an, response_snapshot +): + response = await compressed_tensors_w8an.generate( + "What is deep learning", + max_new_tokens=10, + repetition_penalty=1.2, + return_full_text=True, + stop_sequences=["test"], + temperature=0.5, + top_p=0.9, + top_k=10, + truncate=5, + typical_p=0.9, + watermark=True, + decoder_input_details=True, + seed=0, + ) + + assert response.details.generated_tokens == 10 + assert ( + response.generated_text + == "What is deep learning?\nDeep learning, also known as neural network or" + ) + assert response == response_snapshot + + +@pytest.mark.release +@pytest.mark.asyncio +@pytest.mark.private +async def test_compressed_tensors_w8an_load( + compressed_tensors_w8an, generate_load, response_snapshot +): + responses = await generate_load( + compressed_tensors_w8an, + "What is deep learning?", + max_new_tokens=10, + n=4, + ) + + assert ( + responses[0].generated_text + == " Deep learning is a type of artificial intelligence (AI" + ) + assert len(responses) == 4 + assert all([r.generated_text == responses[0].generated_text for r in responses]) + + assert responses == response_snapshot diff --git a/integration-tests/models/test_compressed_tensors_wna16_int.py b/integration-tests/models/test_compressed_tensors_wna16_int.py new file mode 100644 index 00000000..1de86b1e --- /dev/null +++ b/integration-tests/models/test_compressed_tensors_wna16_int.py @@ -0,0 +1,86 @@ +import pytest + + +@pytest.fixture(scope="module") +def compressed_tensors_wna16_handle(launcher): + with launcher( + "neuralmagic/gemma-2-2b-it-quantized.w4a16", + num_shard=2, + quantize="compressed-tensors", + ) as handle: + yield handle + + 
+@pytest.fixture(scope="module") +async def compressed_tensors_wna16(compressed_tensors_wna16_handle): + await compressed_tensors_wna16_handle.health(300) + return compressed_tensors_wna16_handle.client + + +@pytest.mark.release +@pytest.mark.asyncio +@pytest.mark.private +async def test_compressed_tensors_wna16(compressed_tensors_wna16, response_snapshot): + response = await compressed_tensors_wna16.generate( + "What is deep learning?", + max_new_tokens=10, + decoder_input_details=True, + ) + + assert ( + response.generated_text + == "\n\nDeep learning is a subset of machine learning that" + ) + assert response.details.generated_tokens == 10 + assert response == response_snapshot + + +@pytest.mark.asyncio +async def test_compressed_tensors_wna16_all_params( + compressed_tensors_wna16, response_snapshot +): + response = await compressed_tensors_wna16.generate( + "What is deep learning", + max_new_tokens=10, + repetition_penalty=1.2, + return_full_text=True, + stop_sequences=["test"], + temperature=0.5, + top_p=0.9, + top_k=10, + truncate=5, + typical_p=0.9, + watermark=True, + decoder_input_details=True, + seed=0, + ) + + assert response.details.generated_tokens == 10 + assert ( + response.generated_text + == "What is deep learning?\n\nDeep Learning is a subset of machine learning" + ) + assert response == response_snapshot + + +@pytest.mark.release +@pytest.mark.asyncio +@pytest.mark.private +async def test_compressed_tensors_wna16_load( + compressed_tensors_wna16, generate_load, response_snapshot +): + responses = await generate_load( + compressed_tensors_wna16, + "What is deep learning?", + max_new_tokens=10, + n=4, + ) + + assert ( + responses[0].generated_text + == "\n\nDeep learning is a subset of machine learning that" + ) + assert len(responses) == 4 + assert all([r.generated_text == responses[0].generated_text for r in responses]) + + assert responses == response_snapshot diff --git a/launcher/src/main.rs b/launcher/src/main.rs index 64f4f515..510fa28c 
100644 --- a/launcher/src/main.rs +++ b/launcher/src/main.rs @@ -212,6 +212,8 @@ enum Quantization { /// . /// Should replace GPTQ models wherever possible because of the better latency Awq, + /// Compressed tensors, which can be a mixture of different quantization methods. + CompressedTensors, /// 8 bit quantization, doesn't require specific model. /// Should be a drop-in replacement to bitsandbytes with much better performance. /// Kernels are from @@ -274,6 +276,9 @@ impl std::fmt::Display for Quantization { Quantization::Awq => { write!(f, "awq") } + Quantization::CompressedTensors => { + write!(f, "compressed-tensors") + } Quantization::Eetq => { write!(f, "eetq") } diff --git a/nix/server.nix b/nix/server.nix index 40915546..a96e53ac 100644 --- a/nix/server.nix +++ b/nix/server.nix @@ -5,6 +5,7 @@ mypy-protobuf, awq-inference-engine, causal-conv1d, + compressed-tensors, eetq, einops, exllamav2, @@ -74,6 +75,7 @@ buildPythonPackage { awq-inference-engine eetq causal-conv1d + compressed-tensors einops exllamav2 flashinfer diff --git a/server/Makefile b/server/Makefile index 018d3d8c..5f9f9654 100644 --- a/server/Makefile +++ b/server/Makefile @@ -23,7 +23,7 @@ gen-server: install-server: gen-server pip install pip --upgrade pip install -r requirements_cuda.txt - pip install -e ".[accelerate, quantize, peft, outlines]" + pip install -e ".[accelerate, compressed-tensors, quantize, peft, outlines]" install: install-cuda diff --git a/server/poetry.lock b/server/poetry.lock index 1f096035..d5b84de3 100644 --- a/server/poetry.lock +++ b/server/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 1.8.3 and should not be changed by hand. +# This file is automatically @generated by Poetry 1.8.4 and should not be changed by hand. 
[[package]] name = "accelerate" @@ -388,6 +388,26 @@ files = [ {file = "colorama-0.4.6.tar.gz", hash = "sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44"}, ] +[[package]] +name = "compressed-tensors" +version = "0.7.1" +description = "Library for utilization of compressed safetensors of neural network models" +optional = true +python-versions = "*" +files = [ + {file = "compressed-tensors-0.7.1.tar.gz", hash = "sha256:3c7865ebfe4ea76ae94d7c674bcf93aedd2064571f682c09a377a219d5ebb3a0"}, + {file = "compressed_tensors-0.7.1-py3-none-any.whl", hash = "sha256:22d11558a70f655ae647db9c8e9fb14a5e9d6983ca5aec3f267518625fd6dd0e"}, +] + +[package.dependencies] +pydantic = ">=2.0" +torch = ">=1.7.0" +transformers = "*" + +[package.extras] +accelerate = ["accelerate"] +dev = ["black (==22.12.0)", "flake8 (>=3.8.3)", "isort (==5.8.0)", "nbconvert (>=7.16.3)", "pytest (>=6.0.0)", "wheel (>=0.36.2)"] + [[package]] name = "datasets" version = "2.21.0" @@ -3982,4 +4002,4 @@ torch = ["torch"] [metadata] lock-version = "2.0" python-versions = ">=3.9,<3.13" -content-hash = "b39033e573f50a0f046787aebf1702d86673aad0b2fcee818404fcea7f644b81" +content-hash = "4636689efd4c94559c3c23903aafcffd177533a3b9006b3b4f8491b158a3a754" diff --git a/server/pyproject.toml b/server/pyproject.toml index 5c414d6e..91ddfd6c 100644 --- a/server/pyproject.toml +++ b/server/pyproject.toml @@ -37,6 +37,7 @@ pillow = "^10.0.0" outlines= { version = "^0.0.34", optional = true } prometheus-client = "^0.20.0" py-cpuinfo = "^9.0.0" +compressed-tensors = { version = "^0.7.1", optional = true } # Remove later, temporary workaround for outlines. 
numpy = "^1.26" @@ -58,6 +59,7 @@ rich = "^13.7.1" torch = ["torch"] accelerate = ["accelerate"] bnb = ["bitsandbytes"] +compressed-tensors = ["compressed-tensors"] marlin = ["marlin-kernels"] moe = ["moe-kernels"] peft = ["peft"] diff --git a/server/text_generation_server/cli.py b/server/text_generation_server/cli.py index a363b33a..d8155b49 100644 --- a/server/text_generation_server/cli.py +++ b/server/text_generation_server/cli.py @@ -19,6 +19,7 @@ class Quantization(str, Enum): bitsandbytes_fp4 = "bitsandbytes-fp4" gptq = "gptq" awq = "awq" + compressed_tensors = "compressed-tensors" eetq = "eetq" exl2 = "exl2" fp8 = "fp8" diff --git a/server/text_generation_server/layers/compressed_tensors/__init__.py b/server/text_generation_server/layers/compressed_tensors/__init__.py new file mode 100644 index 00000000..507af706 --- /dev/null +++ b/server/text_generation_server/layers/compressed_tensors/__init__.py @@ -0,0 +1,3 @@ +from .loader import CompressedTensorsLoader + +__all__ = ["CompressedTensorsLoader"] diff --git a/server/text_generation_server/layers/compressed_tensors/loader.py b/server/text_generation_server/layers/compressed_tensors/loader.py new file mode 100644 index 00000000..e5ad3529 --- /dev/null +++ b/server/text_generation_server/layers/compressed_tensors/loader.py @@ -0,0 +1,174 @@ +from typing import Any, Dict, List, Union + +from compressed_tensors import QuantizationConfig, QuantizationStatus +from compressed_tensors.config import CompressionFormat +from compressed_tensors.quantization import ( + QuantizationScheme, + QuantizationType, + find_name_or_class_matches, +) +from loguru import logger +from pydantic import ValidationError +from torch import nn + +from text_generation_server.layers.compressed_tensors.w8an_fp import W8ANFpLoader +from text_generation_server.layers.compressed_tensors.wna16_int import WNA16Loader +from text_generation_server.utils.log import log_once +from text_generation_server.utils.weights import ( + 
DefaultWeightsLoader, + UnquantizedWeight, + Weights, + WeightsLoader, +) + +# compressed-tensors can match modules as quantization targets. However, +# they need to be objects rather than classes or class names. Since we +# need to match `Linear` targets, make an instance that can be re-used. +_EMPTY_LINEAR: nn.Module = nn.Linear(0, 0) + + +class CompressedTensorsLoader(WeightsLoader): + """Loader for checkpoints stored in the compressed-tensors format.""" + + def __init__(self, config: Dict[str, Any]): + quantization_config_raw = config.get("quantization_config") + if quantization_config_raw is None: + # `compression_config` was renamed to `quantization_config`; support + # retained for backward compatibility. + quantization_config_raw = config.get("compression_config") + if quantization_config_raw is None: + raise ValueError( + "Checkpoint does not have compressed-tensors configuration" + ) + + try: + quantization_config = QuantizationConfig.model_validate( + quantization_config_raw + ) + except ValidationError as e: + raise ValueError("Cannot parse compressed-tensors configuration") from e + + if quantization_config.quantization_status not in ( + QuantizationStatus.COMPRESSED, + QuantizationStatus.FROZEN, + ): + raise ValueError( + f"Model quantization was not finished, status was: {quantization_config.quantization_status}" + ) + + self.ignore = ( + quantization_config.ignore if quantization_config.ignore is not None else [] + ) + self.loaders = self._get_target_loaders(quantization_config) + + for target, loader in self.loaders.items(): + log_once( + logger.info, + f"Using {loader} for compressed-tensors target '{target}'", + ) + + def get_weights(self, weights: Weights, prefix: str): + loader = self._lookup_loader(prefix) + return loader.get_weights(weights, prefix) + + def get_weights_col_packed( + self, + weights: "Weights", + prefix: str, + block_sizes: Union[int, List[int]], + ): + loader = self._lookup_loader(prefix) + return 
loader.get_weights_col_packed(weights, prefix, block_sizes) + + def get_multi_weights_col(self, weights: Weights, prefixes: List[str], dim: int): + loader = self._lookup_loader(prefixes[0]) + return loader.get_multi_weights_col(weights, prefixes, dim) + + def get_weights_row(self, weights: Weights, prefix: str): + loader = self._lookup_loader(prefix) + return loader.get_weights_row(weights, prefix) + + def _get_target_loaders( + self, quantization_config: QuantizationConfig + ) -> Dict[str, WeightsLoader]: + """ + A compressed-tensors checkpoint can use different quantizations + for different targets. This method returns a dictionary with a + loader per target. + """ + + loaders: Dict[str, WeightsLoader] = {} + + format = quantization_config.format + + for group_name, group in quantization_config.config_groups.items(): + # The group configuration can be a string, but does that ever + # happen in a serialized quantization config? + assert isinstance(group, QuantizationScheme) + + loader = self._create_loader_for_group(format, group_name, group) + + # A quantized parameter group can have multiple targets, add the + # loader for all the targets. + for target in group.targets: + if target in loaders: + raise ValueError( + f"Target '{target} has multiple configured loaders'" + ) + loaders[target] = loader + + return loaders + + def _create_loader_for_group( + self, format: str, group_name: str, group: QuantizationScheme + ) -> WeightsLoader: + """ + Find and create a loader for the group with the given quantization + scheme. + """ + # NOTE: we ignore group.output_activations because we don't support + # output quantization yet. + + input_activations = group.input_activations + weights = group.weights + if ( + format + in { + CompressionFormat.float_quantized.value, + CompressionFormat.naive_quantized.value, + } + and weights is not None + and weights.type == QuantizationType.FLOAT + and weights.num_bits == 8 + ): + # FP W8A8 or W8A16. 
+ return W8ANFpLoader(input_activations=input_activations, weights=weights) + elif ( + format == CompressionFormat.pack_quantized.value + and weights is not None + and weights.type == QuantizationType.INT + and weights.num_bits in (4, 8) + ): + # INT W4A16 or W8A16 (GPTQ/AWQ-like). + return WNA16Loader(weights) + else: + raise ValueError( + f"Group '{group_name}' has unsupported compressed-tensors configurtion" + ) + + def _lookup_loader(self, prefix: str) -> WeightsLoader: + """ + Look up the loader to use for a given parameter name (prefix). + """ + + if len(find_name_or_class_matches(prefix, _EMPTY_LINEAR, self.ignore)) > 0: + return DefaultWeightsLoader(UnquantizedWeight) + + # We currently only handle linear layers, so unconditionally pass + # a `Linear` instance. + targets = find_name_or_class_matches(prefix, _EMPTY_LINEAR, self.loaders.keys()) + if len(targets) == 0: + raise ValueError( + f"Cannot find compressed-tensors target for prefix: {prefix}" + ) + return self.loaders[targets[0]] diff --git a/server/text_generation_server/layers/compressed_tensors/w8an_fp.py b/server/text_generation_server/layers/compressed_tensors/w8an_fp.py new file mode 100644 index 00000000..e63c5212 --- /dev/null +++ b/server/text_generation_server/layers/compressed_tensors/w8an_fp.py @@ -0,0 +1,174 @@ +from typing import List, Optional, Union + +import torch +from compressed_tensors.quantization import QuantizationArgs, QuantizationType + +from text_generation_server.layers.fp8 import Fp8Weight, _load_scalar_or_matrix_scale +from text_generation_server.utils.weights import Weights, WeightsLoader + + +class W8ANFpLoader(WeightsLoader): + """ + Loader for W8A8/W8A16 FP compressed-tensors parameters. 
+ """ + + def __init__( + self, + *, + input_activations: Optional[QuantizationArgs], + weights: QuantizationArgs, + ): + assert weights.type == QuantizationType.FLOAT and weights.num_bits == 8 + + # We ignore the `strategy` option which sets the scales to be + # per-tensor, per-channel or per-token. What scales are supported + # is dependent on the kernels used (e.g. cutlass can do tokenwise, + # Torch cannot, and FP8-Marlin does not quantize inputs at all). + # So, instead we try to use the best-possible configuration. + + self.load_weight_scale = not weights.dynamic + self.load_input_scale = ( + input_activations is not None and not input_activations.dynamic + ) + self.force_w8a16 = ( + input_activations is not None and input_activations.num_bits == 16 + ) + + def __str__(self) -> str: + def scale_to_str(scale): + return "static" if scale else "dynamic" + + quantization_type = f"W8A{16 if self.force_w8a16 else 8}" + + return f"{self.__class__.__name__} ({quantization_type}, weight: {scale_to_str(self.load_weight_scale)}, input: {scale_to_str(self.load_input_scale)})" + + def get_weights(self, weights: "Weights", prefix: str): + w = weights.get_tensor(f"{prefix}.weight") + + weight_scale = None + if self.load_weight_scale: + weight_scale = ( + weights.get_tensor(f"{prefix}.weight_scale", to_dtype=False) + .reshape(-1) + .expand(w.shape[0]) + ) + + input_scale = None + if self.load_input_scale: + input_scale = weights.get_tensor( + f"{prefix}.input_scale", to_dtype=False + ).reshape(-1) + + return Fp8Weight( + weight=w, + weight_scale=weight_scale, + input_scale=input_scale, + dtype=weights.dtype, + force_w8a16=self.force_w8a16, + ) + + def get_weights_col_packed( + self, + weights: Weights, + prefix: str, + block_sizes: Union[int, List[int]], + ): + w = weights.get_packed_sharded( + f"{prefix}.weight", dim=0, block_sizes=block_sizes + ) + + weight_scale = None + if self.load_weight_scale: + weight_scale = weights.get_tensor(f"{prefix}.weight_scale", 
to_dtype=False) + if weight_scale.numel() > 1: + weight_scale = weights.get_packed_sharded( + f"{prefix}.weight_scale", + dim=0, + block_sizes=block_sizes, + to_dtype=False, + ) + weight_scale = weight_scale.reshape(-1).expand(w.shape[0]) + + input_scale = None + if self.load_input_scale: + input_scale = weights.get_tensor(f"{prefix}.input_scale", to_dtype=False) + if input_scale.numel() > 1: + input_scale = weights.get_packed_sharded( + f"{prefix}.input_scale", + dim=0, + block_sizes=block_sizes, + to_dtype=False, + ) + input_scale = input_scale.reshape(-1).max() + + return Fp8Weight( + weight=w, + weight_scale=weight_scale, + input_scale=input_scale, + dtype=weights.dtype, + force_w8a16=self.force_w8a16, + ) + + def get_multi_weights_col(self, weights: "Weights", prefixes: List[str], dim: int): + # FIXME: Force to_device to false as fp8 weights do not support torch.cat on device yet + w = [ + weights.get_sharded(f"{p}.weight", dim=0, to_device=False) for p in prefixes + ] + shapes = [x.shape for x in w] + + # Concat then send to the device + w = torch.cat(w, dim=dim).to(weights.device) + + weight_scale = None + if self.load_weight_scale: + weight_scale = [ + _load_scalar_or_matrix_scale(weights, f"{p}.weight_scale", shape) + for p, shape in zip(prefixes, shapes) + ] + weight_scale = torch.cat(weight_scale, dim=0).reshape(-1) + + input_scale = None + if self.load_input_scale: + input_scale = [ + _load_scalar_or_matrix_scale(weights, f"{p}.input_scale", shape) + for p, shape in zip(prefixes, shapes) + if weights.has_tensor(f"{p}.input_scale") + ] + assert len(input_scale) == 0 or len(input_scale) == len(prefixes) + input_scale = ( + torch.cat(input_scale, dim=0).reshape(-1).max() + if len(input_scale) != 0 + else None + ) + + return Fp8Weight( + weight=w, + weight_scale=weight_scale, + input_scale=input_scale, + dtype=weights.dtype, + force_w8a16=self.force_w8a16, + ) + + def get_weights_row(self, weights: "Weights", prefix: str): + w = 
weights.get_sharded(f"{prefix}.weight", dim=1) + weight_scale = None + if self.load_weight_scale: + weight_scale = ( + weights.get_tensor(f"{prefix}.weight_scale", to_dtype=False) + .reshape(-1) + .expand(w.shape[0]) + ) + + input_scale = None + if self.load_input_scale: + input_scale = weights.get_tensor( + f"{prefix}.input_scale", to_dtype=False + ).reshape(-1) + + return Fp8Weight( + weight=w, + weight_scale=weight_scale, + input_scale=input_scale, + dtype=weights.dtype, + force_w8a16=self.force_w8a16, + ) diff --git a/server/text_generation_server/layers/compressed_tensors/wna16_int.py b/server/text_generation_server/layers/compressed_tensors/wna16_int.py new file mode 100644 index 00000000..a616867a --- /dev/null +++ b/server/text_generation_server/layers/compressed_tensors/wna16_int.py @@ -0,0 +1,188 @@ +from typing import List, Union + +import torch +from compressed_tensors.quantization import ActivationOrdering, QuantizationArgs +from loguru import logger + +from text_generation_server.layers.marlin.gptq import repack_gptq_for_marlin +from text_generation_server.utils.log import log_once +from text_generation_server.utils.weights import Weights, WeightsLoader + + +class WNA16Loader(WeightsLoader): + """ + Loader for W4A16/W8A16 INT compressed-tensors parameters. 
+ """ + + def __init__(self, weights: QuantizationArgs): + self.weights = weights + self.desc_act = self.weights.actorder == ActivationOrdering.GROUP + self.groupsize = ( + -1 if self.weights.group_size is None else self.weights.group_size + ) + + def __str__(self) -> str: + quantization_type = f"W{self.weights.num_bits}8A16" + + return f"{self.__class__.__name__} ({quantization_type})" + + def get_weights(self, weights: Weights, prefix: str): + log_once(logger.info, "Using GPTQ-Marlin kernels") + try: + weight_packed = weights.get_tensor(f"{prefix}.weight_packed").t() + except RuntimeError: + raise RuntimeError( + f"Cannot load w{self.weights.num_bits}a16 weight, make sure the model is already quantized" + ) + + zero_point = None + if not self.weights.symmetric: + zero_point = weights.get_tensor(f"{prefix}.weight_zero_point").t() + + g_idx = None + if self.desc_act: + g_idx = weights.get_tensor(f"{prefix}.weight_g_idx") + + scales = weights.get_tensor(f"{prefix}.weight.scales").t() + + return repack_gptq_for_marlin( + qweight=weight_packed.contiguous(), + scales=scales, + qzeros=zero_point, + g_idx=g_idx, + bits=self.weights.num_bits, + desc_act=self.desc_act, + groupsize=self.groupsize, + quant_method="compressed-tensors", + sym=self.weights.symmetric, + sharded_infeatures=False, + ) + + def get_weights_col_packed( + self, + weights: Weights, + prefix: str, + block_sizes: Union[int, List[int]], + ): + try: + weight_packed = weights.get_packed_sharded( + f"{prefix}.weight_packed", dim=0, block_sizes=block_sizes + ).t() + except RuntimeError: + raise RuntimeError( + f"Cannot load w{self.weights.num_bits}a16 weight, make sure the model is already quantized" + ) + scales = weights.get_packed_sharded( + f"{prefix}.weight_scale", dim=0, block_sizes=block_sizes + ).t() + scales = scales.to(dtype=weights.dtype) + + zero_point = None + if not self.weights.symmetric: + zero_point = weights.get_packed_sharded( + f"{prefix}.qzeros", dim=0, block_sizes=block_sizes + ).t() + + 
g_idx = None + if self.desc_act: + g_idx = weights.get_tensor(f"{prefix}.g_idx") + + return repack_gptq_for_marlin( + qweight=weight_packed.contiguous(), + scales=scales, + qzeros=zero_point, + g_idx=g_idx, + bits=self.weights.num_bits, + desc_act=self.desc_act, + groupsize=self.groupsize, + quant_method="compressed-tensors", + sym=self.weights.symmetric, + sharded_infeatures=False, + ) + + def get_multi_weights_col(self, weights: Weights, prefixes: List[str], dim: int): + try: + weight_packed = torch.cat( + [ + weights.get_sharded(f"{p}.weight_packed", dim=0).t() + for p in prefixes + ], + dim=1, + ) + except RuntimeError: + raise RuntimeError( + f"Cannot load w{self.weights.num_bits}a16 weight, make sure the model is already quantized" + ) + + scales = torch.cat( + [weights.get_sharded(f"{p}.weight_scale", dim=0).t() for p in prefixes], + dim=1, + ) + + zero_point = None + if not self.weights.symmetric: + zero_point = torch.cat( + [weights.get_sharded(f"{p}.qzeros", dim=0).t() for p in prefixes], dim=1 + ).t() + + g_idx = None + if self.desc_act: + w = [weights.get_tensor(f"{p}.g_idx") for p in prefixes] + for w2 in w[1:]: + torch.testing.assert_close(w2, w[0]) + g_idx = w[0] + + return repack_gptq_for_marlin( + qweight=weight_packed.contiguous(), + scales=scales, + qzeros=zero_point, + g_idx=g_idx, + bits=self.weights.num_bits, + desc_act=self.desc_act, + groupsize=self.groupsize, + quant_method="compressed-tensors", + sym=self.weights.symmetric, + sharded_infeatures=False, + ) + + def get_weights_row(self, weights: Weights, prefix: str): + log_once(logger.info, "Using GPTQ-Marlin kernels") + try: + weight_packed = weights.get_sharded(f"{prefix}.weight_packed", dim=1).t() + except RuntimeError: + raise RuntimeError( + f"Cannot load `{self.quantize}` weight, make sure the model is already quantized." 
+ ) + + zero_point = None + if not self.weights.symmetric: + if self.desc_act or self.groupsize == -1: + zero_point = weights.get_tensor(f"{prefix}.weight_zero_point").t() + else: + zero_point = weights.get_sharded( + f"{prefix}.weight_zero_point", dim=1 + ).t() + + g_idx = None + if self.desc_act: + g_idx = weights.get_sharded(f"{prefix}.g_idx", dim=0) + + if self.desc_act or self.groupsize == -1: + scales = weights.get_tensor(f"{prefix}.weight_scale").t() + else: + scales = weights.get_sharded(f"{prefix}.weight_scale", dim=1).t() + + sharded_in_features = weights.process_group.size() > 1 + + return repack_gptq_for_marlin( + qweight=weight_packed.contiguous(), + scales=scales, + qzeros=zero_point, + g_idx=g_idx, + bits=self.weights.num_bits, + desc_act=self.desc_act, + groupsize=self.groupsize, + quant_method="compressed-tensors", + sym=self.weights.symmetric, + sharded_infeatures=sharded_in_features, + ) diff --git a/server/text_generation_server/layers/fp8.py b/server/text_generation_server/layers/fp8.py index 21688173..1e5c8b3d 100644 --- a/server/text_generation_server/layers/fp8.py +++ b/server/text_generation_server/layers/fp8.py @@ -29,7 +29,7 @@ else: CUTLASS_FP8_AVAILABLE = False -def get_fp8_linear() -> Type[torch.nn.Module]: +def get_fp8_linear(force_w8a16: bool = False) -> Type[torch.nn.Module]: """ Return an FP8 linear `Module` that is compatible with the current system. """ @@ -37,7 +37,14 @@ def get_fp8_linear() -> Type[torch.nn.Module]: if SYSTEM == "cuda": major, _ = torch.cuda.get_device_capability() - if major == 8 and os.getenv("USE_CUTLASS_W8A8", "0") != "1": + # Marlin is W8A16, use it when: + # + # - On capability 8.x where x < 8: W8A8 FP8 GEMM is not supported. + # - On capability 8.9: W8A8 FP8 GEMM is supported, but Marlin-FP8 is faster. + # - On capability 9.x when force_w8a16: cutlass kernels do not support W8A16. 
+ if (major == 8 or (major == 9 and force_w8a16)) and os.getenv( + "USE_CUTLASS_W8A8", "0" + ) != "1": # NOTE: Capability 8.9 is supported by cutlass kernels, but FP8-Marlin # gives better decoding throughput on L4 and L40. from text_generation_server.layers.marlin import GPTQMarlinFP8Linear @@ -283,14 +290,17 @@ class Fp8Weight(Weight): weight_scale: Optional[torch.Tensor] = None input_scale: Optional[torch.Tensor] = None activation_scale_ub: Optional[float] = None + force_w8a16: bool = False def get_linear(self, bias: torch.Tensor): if self.weight_scale is None: - return get_fp8_linear().from_unquant(self.weight, bias, self.dtype) + return get_fp8_linear(force_w8a16=self.force_w8a16).from_unquant( + self.weight, bias, self.dtype + ) # This is not checked by the fbgemm kernels, but they require contiguous # memory. Can be non-contiguous when we e.g. expand from scalars. self.weight_scale = self.weight_scale.contiguous() - return get_fp8_linear().from_fp8( + return get_fp8_linear(force_w8a16=self.force_w8a16).from_fp8( weight=self.weight, scale=self.weight_scale, dtype=self.dtype, diff --git a/server/text_generation_server/layers/marlin/gptq.py b/server/text_generation_server/layers/marlin/gptq.py index 47341c0f..5c1bb549 100644 --- a/server/text_generation_server/layers/marlin/gptq.py +++ b/server/text_generation_server/layers/marlin/gptq.py @@ -261,7 +261,7 @@ class GPTQMarlinWeight(Weight): def __post_init__(self): assert self.qweight.dtype == torch.int32 - assert self.scales.dtype == torch.float16 + assert self.scales.dtype in (torch.float16, torch.bfloat16) assert self.g_idx.dtype == torch.int32 assert self.perm.dtype == torch.int32 @@ -300,7 +300,7 @@ def repack_gptq_for_marlin( raise RuntimeError( f"Repacking GPTQ weights with group size {groupsize} as Marlin is not supported, must be one of: {supported_sizes}" ) - if not (sym or quant_method == "awq"): + if not (sym or quant_method == "awq" or quant_method == "compressed-tensors"): raise RuntimeError( 
"Repacking GPTQ weights with asymmetric quantization as Marlin is not supported." ) diff --git a/server/text_generation_server/models/__init__.py b/server/text_generation_server/models/__init__.py index 6c633521..63534145 100644 --- a/server/text_generation_server/models/__init__.py +++ b/server/text_generation_server/models/__init__.py @@ -370,46 +370,23 @@ def get_model( compression_config = config_dict.get("compression_config", None) if quantization_config is not None and quantize is None: method = quantization_config.get("quant_method", None) - config_groups = quantization_config.get("config_groups", None) if method in {"gptq", "awq", "exl2"}: log_master(logger.info, f"Auto selecting quantization method {method}") quantize = method elif method == "fbgemm_fp8" or method == "fp8": log_master(logger.info, "Auto selecting quantization method fp8") quantize = "fp8" - elif config_groups is not None: - # TODO: at some point we should probably fully parse the compression - # configuration to know which parameters are compressed. - for _, group in config_groups.items(): - weights_config = group.get("weights") - if weights_config is not None: - if ( - weights_config["type"] == "float" - and weights_config["num_bits"] == 8 - ): - log_master( - logger.info, "Auto selecting quantization method fp8" - ) - quantize = "fp8" - break + if method == "compressed-tensors": + log_master( + logger.info, "Auto selecting quantization method compressed-tensors" + ) + quantize = "compressed-tensors" else: log_master(logger.warning, f"Unknown quantization method {method}") elif compression_config is not None: # `compression_config` renamed to `quantization_config`; support retained for backward compatibility. 
- config_groups = compression_config.get("config_groups") - if config_groups is not None: - for _, group in config_groups.items(): - weights_config = group.get("weights") - if weights_config is not None: - if ( - weights_config["type"] == "float" - and weights_config["num_bits"] == 8 - ): - log_master( - logger.info, "Auto selecting quantization method fp8" - ) - quantize = "fp8" - break + log_master(logger.info, "Auto selecting quantization method compressed-tensors") + quantize = "compressed-tensors" if dtype is None: if quantize in ["awq", "exl2", "gptq", "marlin"]: diff --git a/server/text_generation_server/utils/quantization.py b/server/text_generation_server/utils/quantization.py index ee561acc..0d894939 100644 --- a/server/text_generation_server/utils/quantization.py +++ b/server/text_generation_server/utils/quantization.py @@ -27,7 +27,20 @@ class _FP8QuantizerConfig: activation_scale_ub: float -# We should probably do this with Pytantic JSON deserialization, +def _get_config_json(model_id: str, revision: Optional[str], filename: str): + if os.path.exists( + os.path.join( + model_id, + ) + ): + filename = os.path.join(model_id, filename) + else: + filename = hf_hub_download(model_id, filename=filename, revision=revision) + with open(filename, "r") as f: + return json.load(f) + + +# We should probably do this with Pydantic JSON deserialization, # but for now we'll stay close to the old _set_gptq_params. 
def _get_quantizer_config(model_id, revision): bits = 4 @@ -39,12 +52,7 @@ def _get_quantizer_config(model_id, revision): filename = "config.json" try: - if os.path.exists(os.path.join(model_id, filename)): - filename = os.path.join(model_id, filename) - else: - filename = hf_hub_download(model_id, filename=filename, revision=revision) - with open(filename, "r") as f: - data = json.load(f) + data = _get_config_json(model_id, revision, filename) # FP8 config if data["quantization_config"]["quant_method"] == "fbgemm_fp8": @@ -67,14 +75,7 @@ def _get_quantizer_config(model_id, revision): except Exception: filename = "quantize_config.json" try: - if os.path.exists(os.path.join(model_id, filename)): - filename = os.path.join(model_id, filename) - else: - filename = hf_hub_download( - model_id, filename=filename, revision=revision - ) - with open(filename, "r") as f: - data = json.load(f) + data = _get_config_json(model_id, revision, filename) bits = data["bits"] groupsize = data["group_size"] @@ -90,14 +91,7 @@ def _get_quantizer_config(model_id, revision): except Exception: filename = "quant_config.json" try: - if os.path.exists(os.path.join(model_id, filename)): - filename = os.path.join(model_id, filename) - else: - filename = hf_hub_download( - model_id, filename=filename, revision=revision - ) - with open(filename, "r") as f: - data = json.load(f) + data = _get_config_json(model_id, revision, filename) bits = data["w_bit"] groupsize = data["q_group_size"] desc_act = data["desc_act"] @@ -119,6 +113,14 @@ def _get_quantizer_config(model_id, revision): def get_loader( quantize: Optional[str], model_id: str, revision: Optional[str] ) -> WeightsLoader: + if quantize == "compressed-tensors": + config = _get_config_json(model_id, revision, "config.json") + from text_generation_server.layers.compressed_tensors import ( + CompressedTensorsLoader, + ) + + return CompressedTensorsLoader(config) + quantizer_config = _get_quantizer_config(model_id, revision) if quantize in 
{"awq", "gptq"}: from text_generation_server.layers.gptq import GPTQWeightsLoader From ca4f46ddfc53fcc40200fe570c2b1232fa00c43b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= Date: Thu, 14 Nov 2024 18:48:20 +0100 Subject: [PATCH 03/11] nix: update nixpkgs (#2746) Updates from Triton 2.1.0 to 3.1.0 (among other things). --- flake.lock | 14 +++++++------- flake.nix | 2 +- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/flake.lock b/flake.lock index c5515ae2..dfbd7f0e 100644 --- a/flake.lock +++ b/flake.lock @@ -718,11 +718,11 @@ }, "nixpkgs_6": { "locked": { - "lastModified": 1727675176, - "narHash": "sha256-xIjBFMYldWvj+g8ahxMPofsj+OqxvKJN6YylNHQ7gn4=", + "lastModified": 1731562571, + "narHash": "sha256-9V0C/H6NL2Vk3Y76msqNA8TgwZ6Ge4frOVawTNFJQmM=", "owner": "nixos", "repo": "nixpkgs", - "rev": "a6d0207fea9212d28cd3d487efe6bc699663b93a", + "rev": "19d66fab291f90ce56d0479b128cc7a5271bf666", "type": "github" }, "original": { @@ -978,16 +978,16 @@ "nixpkgs": "nixpkgs_6" }, "locked": { - "lastModified": 1730795478, - "narHash": "sha256-xpkXDKnkhXO4F6Ea3reHmqwXXRzQe2PsxdRQFPCViWs=", + "lastModified": 1731601436, + "narHash": "sha256-PJmXLyz06XnLG3wB5vRLgeJXoVvpuCx6c70khYv6J1o=", "owner": "huggingface", "repo": "text-generation-inference-nix", - "rev": "b7f6c07867d94d6e55f5352573a6b3dad1c88e56", + "rev": "9510f57282795d6e0dbbd163d2b77a6b5bb52566", "type": "github" }, "original": { "owner": "huggingface", - "ref": "compressed-tensors-0.7.1", + "ref": "nixpkgs-update-20241114", "repo": "text-generation-inference-nix", "type": "github" } diff --git a/flake.nix b/flake.nix index 1a1e6fe2..708ee65b 100644 --- a/flake.nix +++ b/flake.nix @@ -5,7 +5,7 @@ inputs.nixpkgs.follows = "tgi-nix/nixpkgs"; }; nix-filter.url = "github:numtide/nix-filter"; - tgi-nix.url = "github:huggingface/text-generation-inference-nix/compressed-tensors-0.7.1"; + tgi-nix.url = "github:huggingface/text-generation-inference-nix/nixpkgs-update-20241114"; nixpkgs.follows = 
"tgi-nix/nixpkgs"; flake-utils.url = "github:numtide/flake-utils"; rust-overlay = { From 8442f1ac850d642e0fc5c128f50aafd00b93ed80 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= Date: Fri, 15 Nov 2024 13:14:55 +0100 Subject: [PATCH 04/11] benchmark: fix prefill throughput (#2741) --- benchmark/src/generation.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmark/src/generation.rs b/benchmark/src/generation.rs index 63fc7808..60d96f70 100644 --- a/benchmark/src/generation.rs +++ b/benchmark/src/generation.rs @@ -180,7 +180,7 @@ async fn prefill( let latency = start_time.elapsed(); // Compute throughput from latency and batch size - let throughput = batch_size as f64 / latency.as_secs_f64(); + let throughput = (batch_size * sequence_length) as f64 / latency.as_secs_f64(); // Decode batch cannot be empty let decode_batch = decode_batch.expect("decode_batch is None. This is a bug."); From f9ee46f740091c1b5a0825c2f1f743ba28b2b917 Mon Sep 17 00:00:00 2001 From: Billel Mokeddem Date: Fri, 15 Nov 2024 16:15:36 +0400 Subject: [PATCH 05/11] Fix: Change model_type from ssm to mamba (#2740) Co-authored-by: Ubuntu --- server/text_generation_server/models/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/server/text_generation_server/models/__init__.py b/server/text_generation_server/models/__init__.py index 63534145..c6e406c9 100644 --- a/server/text_generation_server/models/__init__.py +++ b/server/text_generation_server/models/__init__.py @@ -534,7 +534,7 @@ def get_model( # TODO: fix how we determine model type for Mamba if "ssm_cfg" in config_dict: # *only happens in Mamba case - model_type = "ssm" + model_type = "mamba" else: raise RuntimeError( f"Could not determine model type for {model_id} revision {revision}" From 4f4857a4ac4d09483f72465e5adcd29f38b03b16 Mon Sep 17 00:00:00 2001 From: Billel Mokeddem Date: Fri, 15 Nov 2024 16:16:15 +0400 Subject: [PATCH 06/11] Fix: Change embeddings to embedding 
(#2738) fix: change embeddings to embedding Co-authored-by: Ubuntu --- .../models/custom_modeling/mamba_modeling.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/server/text_generation_server/models/custom_modeling/mamba_modeling.py b/server/text_generation_server/models/custom_modeling/mamba_modeling.py index 07284e6a..5a9c0588 100644 --- a/server/text_generation_server/models/custom_modeling/mamba_modeling.py +++ b/server/text_generation_server/models/custom_modeling/mamba_modeling.py @@ -212,7 +212,7 @@ class MambaModel(nn.Module): try: self.lm_head = SpeculativeHead.load(config, f"{prefix}.embeddings", weights) except RuntimeError: - self.lm_head = SpeculativeHead.load(config, f"{prefix}.embeddings", weights) + self.lm_head = SpeculativeHead.load(config, f"{prefix}.embedding", weights) self.config = config def forward( From 003eaec0fbe00aacf03547b317163363cef56ab9 Mon Sep 17 00:00:00 2001 From: jito Date: Fri, 15 Nov 2024 21:21:50 +0900 Subject: [PATCH 07/11] fix response type of document for Text Generation Inference (#2743) Signed-off-by: jitokim --- docs/openapi.json | 5 ++++- router/src/server.rs | 2 +- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/docs/openapi.json b/docs/openapi.json index 22b06720..e4c8ffdb 100644 --- a/docs/openapi.json +++ b/docs/openapi.json @@ -36,7 +36,10 @@ "content": { "application/json": { "schema": { - "$ref": "#/components/schemas/GenerateResponse" + "type": "array", + "items": { + "$ref": "#/components/schemas/GenerateResponse" + } } }, "text/event-stream": { diff --git a/router/src/server.rs b/router/src/server.rs index 2058bce3..a0bc1768 100644 --- a/router/src/server.rs +++ b/router/src/server.rs @@ -109,7 +109,7 @@ request_body = CompatGenerateRequest, responses( (status = 200, description = "Generated Text", content( -("application/json" = GenerateResponse), +("application/json" = Vec<GenerateResponse>), ("text/event-stream" = StreamResponse), )), (status = 424, description = "Generation Error", body =
ErrorResponse, From 4580ced091007ee110636ac559b78bc7c2b3b017 Mon Sep 17 00:00:00 2001 From: Alex Weston <43505988+aW3st@users.noreply.github.com> Date: Fri, 15 Nov 2024 07:22:52 -0500 Subject: [PATCH 08/11] Upgrade outlines to 0.1.1 (#2742) * Upgrade outlines to 0.1.1 * Update for new API * Check if allowed tokens is None --------- Co-authored-by: Nicolas Patry --- server/poetry.lock | 170 +++++++++--------- server/pyproject.toml | 2 +- .../utils/logits_process.py | 18 +- 3 files changed, 93 insertions(+), 97 deletions(-) diff --git a/server/poetry.lock b/server/poetry.lock index d5b84de3..ad7dab18 100644 --- a/server/poetry.lock +++ b/server/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 1.8.4 and should not be changed by hand. +# This file is automatically @generated by Poetry 1.8.3 and should not be changed by hand. [[package]] name = "accelerate" @@ -167,6 +167,17 @@ files = [ [package.dependencies] frozenlist = ">=1.1.0" +[[package]] +name = "airportsdata" +version = "20241001" +description = "Extensive database of location and timezone data for nearly every airport and landing strip in the world." 
+optional = true +python-versions = ">=3.9" +files = [ + {file = "airportsdata-20241001-py3-none-any.whl", hash = "sha256:67d71cf2c5378cc17ff66b62b1e11aa2444043949c894543ac8fd8dafce192fd"}, + {file = "airportsdata-20241001.tar.gz", hash = "sha256:fa0bd143b4f4be3557cb892fa0612ef210fd91a92bd720b4d8221de576a4fa00"}, +] + [[package]] name = "annotated-types" version = "0.7.0" @@ -1043,17 +1054,6 @@ MarkupSafe = ">=2.0" [package.extras] i18n = ["Babel (>=2.7)"] -[[package]] -name = "joblib" -version = "1.4.2" -description = "Lightweight pipelining with Python functions" -optional = true -python-versions = ">=3.8" -files = [ - {file = "joblib-1.4.2-py3-none-any.whl", hash = "sha256:06d478d5674cbc267e7496a410ee875abd68e4340feff4490bcb7afb88060ae6"}, - {file = "joblib-1.4.2.tar.gz", hash = "sha256:2382c5816b2636fbd20a09e0f4e9dad4736765fdfb7dca582943b9c1366b3f0e"}, -] - [[package]] name = "jsonschema" version = "4.23.0" @@ -1106,36 +1106,6 @@ interegular = ["interegular (>=0.3.1,<0.4.0)"] nearley = ["js2py"] regex = ["regex"] -[[package]] -name = "llvmlite" -version = "0.43.0" -description = "lightweight wrapper around basic LLVM functionality" -optional = true -python-versions = ">=3.9" -files = [ - {file = "llvmlite-0.43.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:a289af9a1687c6cf463478f0fa8e8aa3b6fb813317b0d70bf1ed0759eab6f761"}, - {file = "llvmlite-0.43.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:6d4fd101f571a31acb1559ae1af30f30b1dc4b3186669f92ad780e17c81e91bc"}, - {file = "llvmlite-0.43.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7d434ec7e2ce3cc8f452d1cd9a28591745de022f931d67be688a737320dfcead"}, - {file = "llvmlite-0.43.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6912a87782acdff6eb8bf01675ed01d60ca1f2551f8176a300a886f09e836a6a"}, - {file = "llvmlite-0.43.0-cp310-cp310-win_amd64.whl", hash = "sha256:14f0e4bf2fd2d9a75a3534111e8ebeb08eda2f33e9bdd6dfa13282afacdde0ed"}, - {file = 
"llvmlite-0.43.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:3e8d0618cb9bfe40ac38a9633f2493d4d4e9fcc2f438d39a4e854f39cc0f5f98"}, - {file = "llvmlite-0.43.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:e0a9a1a39d4bf3517f2af9d23d479b4175ead205c592ceeb8b89af48a327ea57"}, - {file = "llvmlite-0.43.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c1da416ab53e4f7f3bc8d4eeba36d801cc1894b9fbfbf2022b29b6bad34a7df2"}, - {file = "llvmlite-0.43.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:977525a1e5f4059316b183fb4fd34fa858c9eade31f165427a3977c95e3ee749"}, - {file = "llvmlite-0.43.0-cp311-cp311-win_amd64.whl", hash = "sha256:d5bd550001d26450bd90777736c69d68c487d17bf371438f975229b2b8241a91"}, - {file = "llvmlite-0.43.0-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:f99b600aa7f65235a5a05d0b9a9f31150c390f31261f2a0ba678e26823ec38f7"}, - {file = "llvmlite-0.43.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:35d80d61d0cda2d767f72de99450766250560399edc309da16937b93d3b676e7"}, - {file = "llvmlite-0.43.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:eccce86bba940bae0d8d48ed925f21dbb813519169246e2ab292b5092aba121f"}, - {file = "llvmlite-0.43.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:df6509e1507ca0760787a199d19439cc887bfd82226f5af746d6977bd9f66844"}, - {file = "llvmlite-0.43.0-cp312-cp312-win_amd64.whl", hash = "sha256:7a2872ee80dcf6b5dbdc838763d26554c2a18aa833d31a2635bff16aafefb9c9"}, - {file = "llvmlite-0.43.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:9cd2a7376f7b3367019b664c21f0c61766219faa3b03731113ead75107f3b66c"}, - {file = "llvmlite-0.43.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:18e9953c748b105668487b7c81a3e97b046d8abf95c4ddc0cd3c94f4e4651ae8"}, - {file = "llvmlite-0.43.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:74937acd22dc11b33946b67dca7680e6d103d6e90eeaaaf932603bec6fe7b03a"}, 
- {file = "llvmlite-0.43.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bc9efc739cc6ed760f795806f67889923f7274276f0eb45092a1473e40d9b867"}, - {file = "llvmlite-0.43.0-cp39-cp39-win_amd64.whl", hash = "sha256:47e147cdda9037f94b399bf03bfd8a6b6b1f2f90be94a454e3386f006455a9b4"}, - {file = "llvmlite-0.43.0.tar.gz", hash = "sha256:ae2b5b5c3ef67354824fb75517c8db5fbe93bc02cd9671f3c62271626bc041d5"}, -] - [[package]] name = "loguru" version = "0.6.0" @@ -1577,40 +1547,6 @@ doc = ["nb2plots (>=0.7)", "nbconvert (<7.9)", "numpydoc (>=1.6)", "pillow (>=9. extra = ["lxml (>=4.6)", "pydot (>=1.4.2)", "pygraphviz (>=1.11)", "sympy (>=1.10)"] test = ["pytest (>=7.2)", "pytest-cov (>=4.0)"] -[[package]] -name = "numba" -version = "0.60.0" -description = "compiling Python code using LLVM" -optional = true -python-versions = ">=3.9" -files = [ - {file = "numba-0.60.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:5d761de835cd38fb400d2c26bb103a2726f548dc30368853121d66201672e651"}, - {file = "numba-0.60.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:159e618ef213fba758837f9837fb402bbe65326e60ba0633dbe6c7f274d42c1b"}, - {file = "numba-0.60.0-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:1527dc578b95c7c4ff248792ec33d097ba6bef9eda466c948b68dfc995c25781"}, - {file = "numba-0.60.0-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:fe0b28abb8d70f8160798f4de9d486143200f34458d34c4a214114e445d7124e"}, - {file = "numba-0.60.0-cp310-cp310-win_amd64.whl", hash = "sha256:19407ced081d7e2e4b8d8c36aa57b7452e0283871c296e12d798852bc7d7f198"}, - {file = "numba-0.60.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:a17b70fc9e380ee29c42717e8cc0bfaa5556c416d94f9aa96ba13acb41bdece8"}, - {file = "numba-0.60.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:3fb02b344a2a80efa6f677aa5c40cd5dd452e1b35f8d1c2af0dfd9ada9978e4b"}, - {file = "numba-0.60.0-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", 
hash = "sha256:5f4fde652ea604ea3c86508a3fb31556a6157b2c76c8b51b1d45eb40c8598703"}, - {file = "numba-0.60.0-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:4142d7ac0210cc86432b818338a2bc368dc773a2f5cf1e32ff7c5b378bd63ee8"}, - {file = "numba-0.60.0-cp311-cp311-win_amd64.whl", hash = "sha256:cac02c041e9b5bc8cf8f2034ff6f0dbafccd1ae9590dc146b3a02a45e53af4e2"}, - {file = "numba-0.60.0-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:d7da4098db31182fc5ffe4bc42c6f24cd7d1cb8a14b59fd755bfee32e34b8404"}, - {file = "numba-0.60.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:38d6ea4c1f56417076ecf8fc327c831ae793282e0ff51080c5094cb726507b1c"}, - {file = "numba-0.60.0-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:62908d29fb6a3229c242e981ca27e32a6e606cc253fc9e8faeb0e48760de241e"}, - {file = "numba-0.60.0-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:0ebaa91538e996f708f1ab30ef4d3ddc344b64b5227b67a57aa74f401bb68b9d"}, - {file = "numba-0.60.0-cp312-cp312-win_amd64.whl", hash = "sha256:f75262e8fe7fa96db1dca93d53a194a38c46da28b112b8a4aca168f0df860347"}, - {file = "numba-0.60.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:01ef4cd7d83abe087d644eaa3d95831b777aa21d441a23703d649e06b8e06b74"}, - {file = "numba-0.60.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:819a3dfd4630d95fd574036f99e47212a1af41cbcb019bf8afac63ff56834449"}, - {file = "numba-0.60.0-cp39-cp39-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:0b983bd6ad82fe868493012487f34eae8bf7dd94654951404114f23c3466d34b"}, - {file = "numba-0.60.0-cp39-cp39-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:c151748cd269ddeab66334bd754817ffc0cabd9433acb0f551697e5151917d25"}, - {file = "numba-0.60.0-cp39-cp39-win_amd64.whl", hash = "sha256:3031547a015710140e8c87226b4cfe927cac199835e5bf7d4fe5cb64e814e3ab"}, - {file = "numba-0.60.0.tar.gz", hash = 
"sha256:5df6158e5584eece5fc83294b949fd30b9f1125df7708862205217e068aabf16"}, -] - -[package.dependencies] -llvmlite = "==0.43.*" -numpy = ">=1.22,<2.1" - [[package]] name = "numpy" version = "1.26.4" @@ -1988,36 +1924,83 @@ opentelemetry-api = "1.25.0" [[package]] name = "outlines" -version = "0.0.34" +version = "0.1.1" description = "Probabilistic Generative Model Programming" optional = true -python-versions = ">=3.8" +python-versions = ">=3.9" files = [ - {file = "outlines-0.0.34-py3-none-any.whl", hash = "sha256:911588a7e64a4f193b97fb4c501d98ccfd4e95a98f6a3ada67a280bf0c373c50"}, - {file = "outlines-0.0.34.tar.gz", hash = "sha256:594e7204c770b47a62eb5c2ba7d25ea0ab2e16882b5f04556712a0228d3d3309"}, + {file = "outlines-0.1.1-py3-none-any.whl", hash = "sha256:896aee7f8f0472955104bb30fb118e525bced6885f09e833bb848782394f2c17"}, + {file = "outlines-0.1.1.tar.gz", hash = "sha256:9c5d3524ef21343bd681757e8ed9a5b1fcb335ee68f9b6b0889062ce23b561fc"}, ] [package.dependencies] +airportsdata = "*" cloudpickle = "*" +datasets = "*" diskcache = "*" interegular = "*" jinja2 = "*" -joblib = "*" jsonschema = "*" lark = "*" nest-asyncio = "*" -numba = "*" -numpy = "*" +numpy = "<2.0.0" +outlines-core = "0.1.14" +pycountry = "*" pydantic = ">=2.0" referencing = "*" requests = "*" -scipy = "*" -torch = ">=2.1.0" -transformers = "*" +torch = "*" +tqdm = "*" +typing-extensions = "*" [package.extras] -serve = ["fastapi", "pydantic (>=2.0)", "ray (==2.9.0)", "uvicorn", "vllm (>=0.3.0)"] -test = ["accelerate", "beartype (<0.16.0)", "coverage[toml] (>=5.1)", "datasets", "diff-cover", "huggingface-hub", "llama-cpp-python (>=0.2.42)", "pre-commit", "pytest", "pytest-benchmark", "pytest-cov", "pytest-mock", "responses", "transformers"] +serve = ["fastapi", "pydantic (>=2.0)", "uvicorn", "vllm (>=0.3.0)"] +test = ["accelerate", "beartype (<0.16.0)", "coverage[toml] (>=5.1)", "diff-cover", "exllamav2", "huggingface-hub", "llama-cpp-python", "mlx-lm", "openai (>=1.0.0)", "pillow", "pre-commit", 
"pytest", "pytest-benchmark", "pytest-cov", "pytest-mock", "responses", "transformers", "vllm"] + +[[package]] +name = "outlines-core" +version = "0.1.14" +description = "Structured Text Generation in Rust" +optional = true +python-versions = ">=3.8" +files = [ + {file = "outlines_core-0.1.14-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:291c6d9d348cb5562cd28ce44d80822d77238f1cd7c30d890b5b20488e71608d"}, + {file = "outlines_core-0.1.14-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:3a50e2f6745e0c34cc857d1bd5590e2966ad06e8ce10802976e9e6c116c7533d"}, + {file = "outlines_core-0.1.14-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f7dfe64b590a6a88dcc5e59f0a399fff0458cdcf97d68de07f08e1bd3bf8ac1d"}, + {file = "outlines_core-0.1.14-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:100de068ce52893bec316481e65db8f1c734a0f25f540c29dafd7a8afec0a29d"}, + {file = "outlines_core-0.1.14-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:e06cb724770fd0fe1c8444382c4a6e79901bba33720f70fe6c8437f58eceb92e"}, + {file = "outlines_core-0.1.14-cp310-cp310-win32.whl", hash = "sha256:6d41da3d8a087fd54133cf910c2d5759da55490bbd0e3bc6c1e7907b54248415"}, + {file = "outlines_core-0.1.14-cp310-cp310-win_amd64.whl", hash = "sha256:646fd1073feed393bc77f9605a2fa27a54551ab04f85867ce789af1dee6326fa"}, + {file = "outlines_core-0.1.14-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:60f3a947fe09106f7668cf832c28b9269b8f0fc109f081608acfce9262213359"}, + {file = "outlines_core-0.1.14-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:5e273a100c922f794d8e077a8161d0985d3005887066b4af3ae7afd3742fe9b8"}, + {file = "outlines_core-0.1.14-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:622e547f11a869fc67be40abc4cbcda89ae6f46f9eb46a1ec0666bd6807e0c67"}, + {file = "outlines_core-0.1.14-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = 
"sha256:60c9933a9faaa51b39aea3518f1822b0d3ec2c9a13b16849caca3955e29e320d"}, + {file = "outlines_core-0.1.14-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:4a8c616ce103ef9102dbf4326f67b03e1e0f46aa19351e57f4beb37588c00428"}, + {file = "outlines_core-0.1.14-cp311-cp311-win32.whl", hash = "sha256:1c77aaa4556cbb6e93cc42be0a6e262f175e0754b7694d702d642ff03df67f2c"}, + {file = "outlines_core-0.1.14-cp311-cp311-win_amd64.whl", hash = "sha256:eb6ffe410866f65dbe17e95b0aabd70d990f058a2dc4e8b74f9583b07248cd36"}, + {file = "outlines_core-0.1.14-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:b0e408b033618f23e9bb928a47b33b1bd4c9d04a3dbec680a20977de3b4f590d"}, + {file = "outlines_core-0.1.14-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:21d1393a6da5d3320e8c8247e9deeb851c5c862fd6ea5c779bd29797e8987155"}, + {file = "outlines_core-0.1.14-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5829c568db76673d36caaf0f86e96748b491b4a209deb9be87617372394a5fb9"}, + {file = "outlines_core-0.1.14-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7e855ec99bce1099c0755bcbfa44568adf7ae0083905ba04f58a17614ddf0fe7"}, + {file = "outlines_core-0.1.14-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:b897cfbf9c2719aa011d9b439b4c6751d9c7df5683b2169617972d4b4a914403"}, + {file = "outlines_core-0.1.14-cp38-cp38-win32.whl", hash = "sha256:4c9d908004b31bcd432156d60f4895bf5e1b51ca8c8eed82b12f1bb57d5bf7fd"}, + {file = "outlines_core-0.1.14-cp38-cp38-win_amd64.whl", hash = "sha256:6668a930d928216d0b319ad84947903f1e27556f604a9743051f795b11008b64"}, + {file = "outlines_core-0.1.14-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:b745aa469cf3fb347b79a257804d75d1324e01691158664c1e413a816ce6b98d"}, + {file = "outlines_core-0.1.14-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:27504c8360467429d6223ebc49180d6956d7418bfc3d324f6ad10f069e1813ad"}, + {file = 
"outlines_core-0.1.14-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fd8f1e1d91a206a520d1c577ce00136de2beb1d200ef93759fd4c9f45abe24d3"}, + {file = "outlines_core-0.1.14-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f30c8acb42895b624c504b85678331c5f9376fa4b8069ce06a27cf80f5881e27"}, + {file = "outlines_core-0.1.14-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:0e6cd0e7d995a7b04d90139a695279ab4a9eb7f492618b2c037a85eaf5f9fc59"}, + {file = "outlines_core-0.1.14-cp39-cp39-win32.whl", hash = "sha256:3104af4084da0e7c3d4b8538b43c725581d66bb68d426bc389680f06c3667476"}, + {file = "outlines_core-0.1.14-cp39-cp39-win_amd64.whl", hash = "sha256:45c6b9baded0337c4dcfa156af05ec4efd2b25c4d976e77be28146e4037b991f"}, + {file = "outlines_core-0.1.14.tar.gz", hash = "sha256:6db033e4f8e48381164e36cc716746640ad5022f0d86e4c88af15c75886b93a4"}, +] + +[package.dependencies] +interegular = "*" +jsonschema = "*" + +[package.extras] +test = ["accelerate", "asv", "beartype (<0.16.0)", "coverage[toml] (>=5.1)", "datasets", "diff-cover", "huggingface-hub", "pillow", "pre-commit", "pydantic", "pytest", "pytest-benchmark", "pytest-cov", "pytest-mock", "setuptools-rust", "torch", "transformers"] [[package]] name = "packaging" @@ -2490,6 +2473,17 @@ numpy = ">=1.16.6" [package.extras] test = ["cffi", "hypothesis", "pandas", "pytest", "pytz"] +[[package]] +name = "pycountry" +version = "24.6.1" +description = "ISO country, subdivision, language, currency and script definitions and their translations" +optional = true +python-versions = ">=3.8" +files = [ + {file = "pycountry-24.6.1-py3-none-any.whl", hash = "sha256:f1a4fb391cd7214f8eefd39556d740adcc233c778a27f8942c8dca351d6ce06f"}, + {file = "pycountry-24.6.1.tar.gz", hash = "sha256:b61b3faccea67f87d10c1f2b0fc0be714409e8fcdcc1315613174f6466c10221"}, +] + [[package]] name = "pydantic" version = "2.9.2" diff --git a/server/pyproject.toml b/server/pyproject.toml 
index 91ddfd6c..ca65b8c8 100644 --- a/server/pyproject.toml +++ b/server/pyproject.toml @@ -34,7 +34,7 @@ peft = { version = "^0.10", optional = true } torch = { version = "^2.4.0", optional = true } scipy = "^1.11.1" pillow = "^10.0.0" -outlines= { version = "^0.0.34", optional = true } +outlines= { version = "^0.1.1", optional = true } prometheus-client = "^0.20.0" py-cpuinfo = "^9.0.0" compressed-tensors = { version = "^0.7.1", optional = true } diff --git a/server/text_generation_server/utils/logits_process.py b/server/text_generation_server/utils/logits_process.py index 9abd886f..ec2813a1 100644 --- a/server/text_generation_server/utils/logits_process.py +++ b/server/text_generation_server/utils/logits_process.py @@ -5,7 +5,7 @@ from loguru import logger from typing import Dict, Union from text_generation_server.pb.generate_pb2 import GrammarType -from outlines.fsm.fsm import RegexFSM +from outlines.fsm.guide import RegexGuide from outlines.fsm.json_schema import build_regex_from_schema from functools import lru_cache from typing import List, Optional, DefaultDict @@ -482,7 +482,7 @@ class HeterogeneousProcessorWrapper(LogitsProcessor): class GrammarLogitProcessor(LogitsProcessor): fsm_state: DefaultDict[int, int] - fsm: RegexFSM + fsm: RegexGuide def __init__(self, tokenizer, device, grammar, grammar_type): self.device = device @@ -498,9 +498,10 @@ class GrammarLogitProcessor(LogitsProcessor): ): if fsm_grammar_state == -1 or self.fsm is None: return logits - allowed_tokens = self.fsm.allowed_token_ids(fsm_grammar_state) + allowed_tokens = self.fsm.get_next_instruction(fsm_grammar_state).tokens mask = torch.full_like(logits, -math.inf) - mask[:, allowed_tokens] = 0 + if allowed_tokens is not None: + mask[:, allowed_tokens] = 0 biased_scores = logits + mask return biased_scores @@ -513,7 +514,7 @@ class GrammarLogitProcessor(LogitsProcessor): def _advance(next_token_id, fsm_grammar_state, fsm): if fsm_grammar_state == -1: return fsm_grammar_state - return 
fsm.next_state(fsm_grammar_state, next_token_id) + return fsm.get_next_state(fsm_grammar_state, next_token_id) # TODO: move grammar compilation into the router @staticmethod @@ -530,7 +531,7 @@ class GrammarLogitProcessor(LogitsProcessor): schema = "(.*?)" elif grammar_type == GrammarType.GRAMMAR_TYPE_REGEX: pass # schema is already a regex just here for clarity - fsm = RegexFSM(schema, tokenizer) + fsm = RegexGuide.from_regex(schema, tokenizer) logger.debug(f"Compiled FSM in {time.time() - start_time:.2f}s") return fsm @@ -588,8 +589,9 @@ class HeterogeneousGrammarLogitProcessor(LogitsProcessor): fsm = self.fsms[i] if fsm_grammar_states[i] == -1 or fsm is None: continue - allowed_tokens = fsm.allowed_token_ids(fsm_grammar_states[i]) - mask[i, allowed_tokens] = 0 + allowed_tokens = fsm.get_next_instruction(fsm_grammar_states[i]).tokens + if allowed_tokens is not None: + mask[i, allowed_tokens] = 0 logits[i] += mask[i] return logits From 34a3bdedc344da762edb173d5c842f5e5790b202 Mon Sep 17 00:00:00 2001 From: Nicolas Patry Date: Fri, 15 Nov 2024 21:03:27 +0800 Subject: [PATCH 09/11] Upgrading our deps. (#2750) * Upgrading our deps. * fixup. * Fixup. --- server/poetry.lock | 11 ++++++----- server/requirements_cuda.txt | 2 +- server/requirements_intel.txt | 2 +- server/requirements_rocm.txt | 2 +- server/text_generation_server/utils/logits_process.py | 2 +- 5 files changed, 10 insertions(+), 9 deletions(-) diff --git a/server/poetry.lock b/server/poetry.lock index ad7dab18..d03d03ae 100644 --- a/server/poetry.lock +++ b/server/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 1.8.3 and should not be changed by hand. +# This file is automatically @generated by Poetry 1.8.4 and should not be changed by hand. 
[[package]] name = "accelerate" @@ -1924,13 +1924,13 @@ opentelemetry-api = "1.25.0" [[package]] name = "outlines" -version = "0.1.1" +version = "0.1.3" description = "Probabilistic Generative Model Programming" optional = true python-versions = ">=3.9" files = [ - {file = "outlines-0.1.1-py3-none-any.whl", hash = "sha256:896aee7f8f0472955104bb30fb118e525bced6885f09e833bb848782394f2c17"}, - {file = "outlines-0.1.1.tar.gz", hash = "sha256:9c5d3524ef21343bd681757e8ed9a5b1fcb335ee68f9b6b0889062ce23b561fc"}, + {file = "outlines-0.1.3-py3-none-any.whl", hash = "sha256:afcf6012b7cabbaae4a58975d03190c0bbc3d402b0b2a37538e05f335d73a247"}, + {file = "outlines-0.1.3.tar.gz", hash = "sha256:5a48ad00d3bdd8eccaa7574821eb5aaa27ab9f61fde9c3fba52f352dc00197e4"}, ] [package.dependencies] @@ -3986,6 +3986,7 @@ type = ["pytest-mypy"] [extras] accelerate = ["accelerate"] bnb = ["bitsandbytes"] +compressed-tensors = ["compressed-tensors"] marlin = ["marlin-kernels", "marlin-kernels", "marlin-kernels", "marlin-kernels"] moe = ["moe-kernels", "moe-kernels", "moe-kernels", "moe-kernels"] outlines = ["outlines"] @@ -3996,4 +3997,4 @@ torch = ["torch"] [metadata] lock-version = "2.0" python-versions = ">=3.9,<3.13" -content-hash = "4636689efd4c94559c3c23903aafcffd177533a3b9006b3b4f8491b158a3a754" +content-hash = "5d1295a8becce2f65dc68d64f200acb5832de50fc0c37392f6f87bbc5b15d32a" diff --git a/server/requirements_cuda.txt b/server/requirements_cuda.txt index e3f6d20f..ad4ea56b 100644 --- a/server/requirements_cuda.txt +++ b/server/requirements_cuda.txt @@ -45,7 +45,7 @@ sentencepiece==0.2.0 ; python_version >= "3.9" and python_version < "3.13" setuptools==75.2.0 ; python_version >= "3.9" and python_version < "3.13" tokenizers==0.20.1 ; python_version >= "3.9" and python_version < "3.13" tqdm==4.66.5 ; python_version >= "3.9" and python_version < "3.13" -transformers==4.45.2 ; python_version >= "3.9" and python_version < "3.13" +transformers==4.46.0 ; python_version >= "3.9" and python_version < 
"3.13" typer==0.6.1 ; python_version >= "3.9" and python_version < "3.13" typing-extensions==4.12.2 ; python_version >= "3.9" and python_version < "3.13" urllib3==2.2.3 ; python_version >= "3.9" and python_version < "3.13" diff --git a/server/requirements_intel.txt b/server/requirements_intel.txt index e3f6d20f..ad4ea56b 100644 --- a/server/requirements_intel.txt +++ b/server/requirements_intel.txt @@ -45,7 +45,7 @@ sentencepiece==0.2.0 ; python_version >= "3.9" and python_version < "3.13" setuptools==75.2.0 ; python_version >= "3.9" and python_version < "3.13" tokenizers==0.20.1 ; python_version >= "3.9" and python_version < "3.13" tqdm==4.66.5 ; python_version >= "3.9" and python_version < "3.13" -transformers==4.45.2 ; python_version >= "3.9" and python_version < "3.13" +transformers==4.46.0 ; python_version >= "3.9" and python_version < "3.13" typer==0.6.1 ; python_version >= "3.9" and python_version < "3.13" typing-extensions==4.12.2 ; python_version >= "3.9" and python_version < "3.13" urllib3==2.2.3 ; python_version >= "3.9" and python_version < "3.13" diff --git a/server/requirements_rocm.txt b/server/requirements_rocm.txt index e3f6d20f..ad4ea56b 100644 --- a/server/requirements_rocm.txt +++ b/server/requirements_rocm.txt @@ -45,7 +45,7 @@ sentencepiece==0.2.0 ; python_version >= "3.9" and python_version < "3.13" setuptools==75.2.0 ; python_version >= "3.9" and python_version < "3.13" tokenizers==0.20.1 ; python_version >= "3.9" and python_version < "3.13" tqdm==4.66.5 ; python_version >= "3.9" and python_version < "3.13" -transformers==4.45.2 ; python_version >= "3.9" and python_version < "3.13" +transformers==4.46.0 ; python_version >= "3.9" and python_version < "3.13" typer==0.6.1 ; python_version >= "3.9" and python_version < "3.13" typing-extensions==4.12.2 ; python_version >= "3.9" and python_version < "3.13" urllib3==2.2.3 ; python_version >= "3.9" and python_version < "3.13" diff --git a/server/text_generation_server/utils/logits_process.py 
b/server/text_generation_server/utils/logits_process.py index ec2813a1..d53f070c 100644 --- a/server/text_generation_server/utils/logits_process.py +++ b/server/text_generation_server/utils/logits_process.py @@ -501,7 +501,7 @@ class GrammarLogitProcessor(LogitsProcessor): allowed_tokens = self.fsm.get_next_instruction(fsm_grammar_state).tokens mask = torch.full_like(logits, -math.inf) if allowed_tokens is not None: - mask[:, allowed_tokens] = 0 + mask[:, allowed_tokens] = 0 biased_scores = logits + mask return biased_scores From 6489f85269ffb91ab1c62c3b76964167206b850a Mon Sep 17 00:00:00 2001 From: drbh Date: Fri, 15 Nov 2024 08:49:19 -0500 Subject: [PATCH 10/11] feat: return streaming errors as an event formatted for openai's client (#2668) * feat: return streaming errors as an event formatted for openai's client * fix: propagate completions error events to stream * fix: improve stream api error format and add status code * fix: improve streamin error to include error_type * Revert "fix: improve streamin error to include error_type" This reverts commit 2b1a360b1511d94ea9a24e5432e498e67939506a. * Reworked the implementation. * Revert "Reworked the implementation." This reverts commit 7c3f29777f17411ae4ade57e2f88e73cde704ee5. * Small lifting. 
--------- Co-authored-by: Nicolas Patry --- router/src/infer/mod.rs | 24 ++++++++++++++++++++++++ router/src/server.rs | 7 +++++-- 2 files changed, 29 insertions(+), 2 deletions(-) diff --git a/router/src/infer/mod.rs b/router/src/infer/mod.rs index 557e03cb..d3d6bc59 100644 --- a/router/src/infer/mod.rs +++ b/router/src/infer/mod.rs @@ -10,10 +10,12 @@ use crate::{ }; use async_stream::stream; use async_trait::async_trait; +use axum::response::sse::Event; use chat_template::ChatTemplate; use futures::future::try_join_all; use futures::Stream; use minijinja::ErrorKind; +use serde::Serialize; use std::sync::atomic::{AtomicBool, Ordering}; use std::sync::Arc; use thiserror::Error; @@ -373,4 +375,26 @@ impl InferError { InferError::StreamSerializationError(_) => "stream_serialization_error", } } + + pub(crate) fn into_openai_event(self) -> Event { + Event::default() + .json_data(OpenaiErrorEvent { + error: APIError { + message: self.to_string(), + http_status_code: 422, + }, + }) + .unwrap() + } +} + +#[derive(Serialize)] +pub struct APIError { + message: String, + http_status_code: usize, +} + +#[derive(Serialize)] +pub struct OpenaiErrorEvent { + error: APIError, } diff --git a/router/src/server.rs b/router/src/server.rs index a0bc1768..cbb04174 100644 --- a/router/src/server.rs +++ b/router/src/server.rs @@ -866,7 +866,7 @@ pub(crate) async fn completions( yield Ok(event); } - Err(err) => yield Ok(Event::from(err)), + Err(err) => yield Ok(err.into_openai_event()), } } }; @@ -1274,7 +1274,8 @@ pub(crate) async fn chat_completions( }; let mut response_as_tool = using_tools; while let Some(result) = response_stream.next().await { - if let Ok(stream_token) = result { + match result{ + Ok(stream_token) => { let token_text = &stream_token.token.text.clone(); match state { StreamState::Buffering => { @@ -1368,6 +1369,8 @@ pub(crate) async fn chat_completions( } } } + Err(err) => yield Ok(err.into_openai_event()) + } } yield Ok::(Event::default().data("[DONE]")); }; From 
52e48739a57e29ba47c238b2bbf06a391066da57 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= Date: Sun, 17 Nov 2024 17:34:50 +0100 Subject: [PATCH 11/11] Remove vLLM dependency for CUDA (#2751) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Remove vLLM dependency for CUDA This change adds `attention-kernels` as a dependency for paged attention and cache reshaping. With that, we don't use vLLM anywhere for CUDA. Tested run (since we don't have paged attention in CI): ``` ❯ ATTENTION=paged python -m pytest integration-tests -k "llama and awq" --release [...] 5 snapshots passed. ``` * Fix clippy warning --- Dockerfile | 16 +---- flake.lock | 7 +- flake.nix | 2 +- nix/server.nix | 12 +++- router/src/lib.rs | 1 + server/Makefile | 4 +- server/Makefile-vllm | 10 --- server/poetry.lock | 71 ++++++++++++++++++- server/pyproject.toml | 11 ++- .../layers/attention/cuda.py | 6 +- .../layers/attention/kv_cache.py | 6 +- .../custom_modeling/flash_dbrx_modeling.py | 4 +- 12 files changed, 106 insertions(+), 44 deletions(-) diff --git a/Dockerfile b/Dockerfile index 565f3779..0c08d48f 100644 --- a/Dockerfile +++ b/Dockerfile @@ -161,18 +161,6 @@ COPY server/custom_kernels/ . 
# Build specific version of transformers RUN python setup.py build -# Build vllm CUDA kernels -FROM kernel-builder AS vllm-builder - -WORKDIR /usr/src - -ENV TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 8.9 9.0+PTX" - -COPY server/Makefile-vllm Makefile - -# Build specific version of vllm -RUN make build-vllm-cuda - # Build mamba kernels FROM kernel-builder AS mamba-builder WORKDIR /usr/src @@ -230,8 +218,6 @@ COPY --from=awq-kernels-builder /usr/src/llm-awq/awq/kernels/build/lib.linux-x86 COPY --from=eetq-kernels-builder /usr/src/eetq/build/lib.linux-x86_64-cpython-311 /opt/conda/lib/python3.11/site-packages # Copy build artifacts from lorax punica kernels builder COPY --from=lorax-punica-builder /usr/src/lorax-punica/server/punica_kernels/build/lib.linux-x86_64-cpython-311 /opt/conda/lib/python3.11/site-packages -# Copy build artifacts from vllm builder -COPY --from=vllm-builder /usr/src/vllm/build/lib.linux-x86_64-cpython-311 /opt/conda/lib/python3.11/site-packages # Copy build artifacts from mamba builder COPY --from=mamba-builder /usr/src/mamba/build/lib.linux-x86_64-cpython-311/ /opt/conda/lib/python3.11/site-packages COPY --from=mamba-builder /usr/src/causal-conv1d/build/lib.linux-x86_64-cpython-311/ /opt/conda/lib/python3.11/site-packages @@ -247,7 +233,7 @@ COPY server/Makefile server/Makefile RUN cd server && \ make gen-server && \ pip install -r requirements_cuda.txt && \ - pip install ".[bnb, accelerate, compressed-tensors, marlin, moe, quantize, peft, outlines]" --no-cache-dir && \ + pip install ".[attention, bnb, accelerate, compressed-tensors, marlin, moe, quantize, peft, outlines]" --no-cache-dir && \ pip install nvidia-nccl-cu12==2.22.3 ENV LD_PRELOAD=/opt/conda/lib/python3.11/site-packages/nvidia/nccl/lib/libnccl.so.2 diff --git a/flake.lock b/flake.lock index dfbd7f0e..6d2ff5dc 100644 --- a/flake.lock +++ b/flake.lock @@ -978,16 +978,15 @@ "nixpkgs": "nixpkgs_6" }, "locked": { - "lastModified": 1731601436, - "narHash": 
"sha256-PJmXLyz06XnLG3wB5vRLgeJXoVvpuCx6c70khYv6J1o=", + "lastModified": 1731674227, + "narHash": "sha256-k/ur37KSc+RXcwwz0tgxeamz6wQ5rsOe5hMepzIdD2s=", "owner": "huggingface", "repo": "text-generation-inference-nix", - "rev": "9510f57282795d6e0dbbd163d2b77a6b5bb52566", + "rev": "407b9e22a0b7121bf6e171d67ce0144e3f3e39bf", "type": "github" }, "original": { "owner": "huggingface", - "ref": "nixpkgs-update-20241114", "repo": "text-generation-inference-nix", "type": "github" } diff --git a/flake.nix b/flake.nix index 708ee65b..f26a983e 100644 --- a/flake.nix +++ b/flake.nix @@ -5,7 +5,7 @@ inputs.nixpkgs.follows = "tgi-nix/nixpkgs"; }; nix-filter.url = "github:numtide/nix-filter"; - tgi-nix.url = "github:huggingface/text-generation-inference-nix/nixpkgs-update-20241114"; + tgi-nix.url = "github:huggingface/text-generation-inference-nix"; nixpkgs.follows = "tgi-nix/nixpkgs"; flake-utils.url = "github:numtide/flake-utils"; rust-overlay = { diff --git a/nix/server.nix b/nix/server.nix index a96e53ac..5903a65a 100644 --- a/nix/server.nix +++ b/nix/server.nix @@ -3,6 +3,7 @@ buildPythonPackage, poetry-core, mypy-protobuf, + attention-kernels, awq-inference-engine, causal-conv1d, compressed-tensors, @@ -27,15 +28,18 @@ opentelemetry-exporter-otlp, opentelemetry-instrumentation-grpc, opentelemetry-semantic-conventions, + outlines, peft, + prometheus-client, punica-kernels, + py-cpuinfo, + pydantic, safetensors, tokenizers, torch, sentencepiece, transformers, typer, - vllm, }: let @@ -72,6 +76,7 @@ buildPythonPackage { pythonRemoveDeps = [ "scipy" ]; dependencies = [ + attention-kernels awq-inference-engine eetq causal-conv1d @@ -95,14 +100,17 @@ buildPythonPackage { opentelemetry-exporter-otlp opentelemetry-instrumentation-grpc opentelemetry-semantic-conventions + outlines peft + prometheus-client punica-kernels + py-cpuinfo + pydantic safetensors sentencepiece tokenizers transformers typer - vllm ]; prePatch = '' diff --git a/router/src/lib.rs b/router/src/lib.rs index 
d9cacb91..c0155852 100644 --- a/router/src/lib.rs +++ b/router/src/lib.rs @@ -22,6 +22,7 @@ use tracing::warn; use utoipa::ToSchema; use validation::Validation; +#[allow(clippy::large_enum_variant)] #[derive(Clone)] pub enum Tokenizer { Python { diff --git a/server/Makefile b/server/Makefile index 5f9f9654..b5677db8 100644 --- a/server/Makefile +++ b/server/Makefile @@ -29,8 +29,8 @@ install-server: gen-server install: install-cuda echo "Installed server" -install-cuda: install-server install-flash-attention-v2-cuda install-vllm-cuda install-flash-attention - pip install -e ".[bnb,marlin,moe]" +install-cuda: install-server install-flash-attention-v2-cuda install-flash-attention + pip install -e ".[attention,bnb,marlin,moe]" pip install nvidia-nccl-cu12==2.22.3 install-rocm: install-server install-flash-attention-v2-rocm install-vllm-rocm diff --git a/server/Makefile-vllm b/server/Makefile-vllm index 18dcc4a0..45a7980d 100644 --- a/server/Makefile-vllm +++ b/server/Makefile-vllm @@ -1,14 +1,4 @@ -commit_cuda := d243e9dc7e2c9c2e36a4150ec8e64809cb55c01b commit_rocm := 4e0929e6e4fa0a3d09d358715c288020ea9dc247 -build-vllm-cuda: - if [ ! -d 'vllm' ]; then \ - pip install -U ninja packaging --no-cache-dir && \ - git clone https://github.com/Narsil/vllm.git vllm; \ - fi - cd vllm && git fetch origin && git checkout $(commit_cuda) && python setup.py build - -install-vllm-cuda: build-vllm-cuda - cd vllm && git fetch origin && git checkout $(commit_cuda) && pip install -e . build-vllm-rocm: if [ ! 
-d 'vllm' ]; then \ diff --git a/server/poetry.lock b/server/poetry.lock index d03d03ae..34656816 100644 --- a/server/poetry.lock +++ b/server/poetry.lock @@ -200,6 +200,74 @@ files = [ {file = "async_timeout-4.0.3-py3-none-any.whl", hash = "sha256:7405140ff1230c310e51dc27b3145b9092d659ce68ff733fb0cefe3ee42be028"}, ] +[[package]] +name = "attention-kernels" +version = "0.1.1" +description = "Attention kernels" +optional = true +python-versions = ">=3.8" +files = [ + {file = "attention_kernels-0.1.1+cu123torch2.4-cp310-cp310-linux_x86_64.whl", hash = "sha256:812851d4ce0f54ca764ff3815a731b15f0cb110115d0aa2d0997cd7794d808bb"}, +] + +[package.dependencies] +torch = "*" + +[package.source] +type = "url" +url = "https://github.com/danieldk/attention-kernels/releases/download/v0.1.1/attention_kernels-0.1.1+cu123torch2.4-cp310-cp310-linux_x86_64.whl" + +[[package]] +name = "attention-kernels" +version = "0.1.1" +description = "Attention kernels" +optional = true +python-versions = ">=3.8" +files = [ + {file = "attention_kernels-0.1.1+cu123torch2.4-cp311-cp311-linux_x86_64.whl", hash = "sha256:614c402621b11dd1f5741a016b9fd27cb6a68814471f2048bc05206923516268"}, +] + +[package.dependencies] +torch = "*" + +[package.source] +type = "url" +url = "https://github.com/danieldk/attention-kernels/releases/download/v0.1.1/attention_kernels-0.1.1+cu123torch2.4-cp311-cp311-linux_x86_64.whl" + +[[package]] +name = "attention-kernels" +version = "0.1.1" +description = "Attention kernels" +optional = true +python-versions = ">=3.8" +files = [ + {file = "attention_kernels-0.1.1+cu123torch2.4-cp312-cp312-linux_x86_64.whl", hash = "sha256:6b2ca7c98997431d5f6c4af7553dce6b1bff8dfdec374c97c6ffba71325a02b7"}, +] + +[package.dependencies] +torch = "*" + +[package.source] +type = "url" +url = "https://github.com/danieldk/attention-kernels/releases/download/v0.1.1/attention_kernels-0.1.1+cu123torch2.4-cp312-cp312-linux_x86_64.whl" + +[[package]] +name = "attention-kernels" +version = "0.1.1" 
+description = "Attention kernels" +optional = true +python-versions = ">=3.8" +files = [ + {file = "attention_kernels-0.1.1+cu123torch2.4-cp39-cp39-linux_x86_64.whl", hash = "sha256:a56710c5626e461d6f628ae14b74ffc89833578ebd59c3c0c47f5d6f07461fbf"}, +] + +[package.dependencies] +torch = "*" + +[package.source] +type = "url" +url = "https://github.com/danieldk/attention-kernels/releases/download/v0.1.1/attention_kernels-0.1.1+cu123torch2.4-cp39-cp39-linux_x86_64.whl" + [[package]] name = "attrs" version = "24.2.0" @@ -3985,6 +4053,7 @@ type = ["pytest-mypy"] [extras] accelerate = ["accelerate"] +attention = ["attention-kernels", "attention-kernels", "attention-kernels", "attention-kernels"] bnb = ["bitsandbytes"] compressed-tensors = ["compressed-tensors"] marlin = ["marlin-kernels", "marlin-kernels", "marlin-kernels", "marlin-kernels"] @@ -3997,4 +4066,4 @@ torch = ["torch"] [metadata] lock-version = "2.0" python-versions = ">=3.9,<3.13" -content-hash = "5d1295a8becce2f65dc68d64f200acb5832de50fc0c37392f6f87bbc5b15d32a" +content-hash = "05add88628d836faceae1a26fde4092651a6eca74555ae38ebff879a7895be7e" diff --git a/server/pyproject.toml b/server/pyproject.toml index ca65b8c8..f039ca8a 100644 --- a/server/pyproject.toml +++ b/server/pyproject.toml @@ -9,7 +9,7 @@ text-generation-server = 'text_generation_server.cli:app' [tool.poetry.dependencies] python = ">=3.9,<3.13" -protobuf = "^4.25.3" +protobuf = ">=4.25.3,<6" grpcio = "^1.51.1" grpcio-status = "^1.51.1" grpcio-reflection = "^1.51.1" @@ -35,12 +35,18 @@ torch = { version = "^2.4.0", optional = true } scipy = "^1.11.1" pillow = "^10.0.0" outlines= { version = "^0.1.1", optional = true } -prometheus-client = "^0.20.0" +prometheus-client = ">=0.20.0,<0.22" py-cpuinfo = "^9.0.0" compressed-tensors = { version = "^0.7.1", optional = true } # Remove later, temporary workaround for outlines. 
numpy = "^1.26" +attention-kernels = [ + { url = "https://github.com/danieldk/attention-kernels/releases/download/v0.1.1/attention_kernels-0.1.1+cu123torch2.4-cp39-cp39-linux_x86_64.whl", python = "~3.9", optional = true }, + { url = "https://github.com/danieldk/attention-kernels/releases/download/v0.1.1/attention_kernels-0.1.1+cu123torch2.4-cp310-cp310-linux_x86_64.whl", python = "~3.10", optional = true }, + { url = "https://github.com/danieldk/attention-kernels/releases/download/v0.1.1/attention_kernels-0.1.1+cu123torch2.4-cp311-cp311-linux_x86_64.whl", python = "~3.11", optional = true }, + { url = "https://github.com/danieldk/attention-kernels/releases/download/v0.1.1/attention_kernels-0.1.1+cu123torch2.4-cp312-cp312-linux_x86_64.whl", python = "~3.12", optional = true }, +] marlin-kernels = [ { url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.1/marlin_kernels-0.3.1+cu123torch2.4-cp39-cp39-linux_x86_64.whl", python = "~3.9", optional = true }, { url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.1/marlin_kernels-0.3.1+cu123torch2.4-cp310-cp310-linux_x86_64.whl", python = "~3.10", optional = true }, @@ -58,6 +64,7 @@ rich = "^13.7.1" [tool.poetry.extras] torch = ["torch"] accelerate = ["accelerate"] +attention = ["attention-kernels"] bnb = ["bitsandbytes"] compressed-tensors = ["compressed-tensors"] marlin = ["marlin-kernels"] diff --git a/server/text_generation_server/layers/attention/cuda.py b/server/text_generation_server/layers/attention/cuda.py index d705afb0..3038602e 100644 --- a/server/text_generation_server/layers/attention/cuda.py +++ b/server/text_generation_server/layers/attention/cuda.py @@ -108,7 +108,7 @@ def paged_attention( if softcap is not None: raise RuntimeError("Paged attention doesn't support softcapping") input_lengths = seqlen.input_lengths + seqlen.cache_lengths - from vllm._C import ops + import attention_kernels out = torch.empty_like(query) @@ -116,7 +116,7 @@ def paged_attention( 
max_num_partitions == 1 or num_seqs * num_heads > 512 ) if use_v1: - ops.paged_attention_v1( + attention_kernels.paged_attention_v1( out, query, kv_cache.key, @@ -146,7 +146,7 @@ def paged_attention( ) max_logits = torch.empty_like(exp_sums) - ops.paged_attention_v2( + attention_kernels.paged_attention_v2( out, exp_sums, max_logits, diff --git a/server/text_generation_server/layers/attention/kv_cache.py b/server/text_generation_server/layers/attention/kv_cache.py index 9d739da5..cad1d98a 100644 --- a/server/text_generation_server/layers/attention/kv_cache.py +++ b/server/text_generation_server/layers/attention/kv_cache.py @@ -200,12 +200,12 @@ def paged_reshape_and_cache( ): if SYSTEM == "cuda": try: - from vllm._C import cache_ops + import attention_kernels except Exception as e: raise ImportError( - f"Could not import vllm paged attention. Make sure your installation is correct. Complete error: {e}" + f"Could not import attention_kernels. Make sure your installation is correct. Complete error: {e}" ) - cache_ops.reshape_and_cache( + attention_kernels.reshape_and_cache( key, value, key_cache, value_cache, slots, "auto", 1.0 ) elif SYSTEM == "rocm": diff --git a/server/text_generation_server/models/custom_modeling/flash_dbrx_modeling.py b/server/text_generation_server/models/custom_modeling/flash_dbrx_modeling.py index f70bff4f..57118362 100644 --- a/server/text_generation_server/models/custom_modeling/flash_dbrx_modeling.py +++ b/server/text_generation_server/models/custom_modeling/flash_dbrx_modeling.py @@ -23,8 +23,10 @@ from typing import Optional, List, Tuple, Any from text_generation_server.layers.attention.kv_cache import get_kv_scales from text_generation_server.utils.import_utils import SYSTEM -if SYSTEM != "ipex": +if SYSTEM == "rocm": from vllm.model_executor.layers.fused_moe import fused_moe +elif SYSTEM != "ipex": + from moe_kernels.fused_moe import fused_moe from text_generation_server.layers.attention import ( paged_attention,