From 0da4df4b96800bced8bdcf5f3402ccc5b1db3202 Mon Sep 17 00:00:00 2001
From: Florian Zimmermeister
Date: Mon, 7 Oct 2024 09:34:19 +0200
Subject: [PATCH] Fix FP8 KV-cache condition (#2611)

Update kv_cache.py
---
 server/text_generation_server/layers/attention/kv_cache.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/server/text_generation_server/layers/attention/kv_cache.py b/server/text_generation_server/layers/attention/kv_cache.py
index 80033122..ced4b5b4 100644
--- a/server/text_generation_server/layers/attention/kv_cache.py
+++ b/server/text_generation_server/layers/attention/kv_cache.py
@@ -26,8 +26,8 @@ class KVCache:
         if (
             dtype == torch.float8_e5m2
-            and ATTENTION != "flashinfer"
-            and SYSTEM != "cuda"
+            and (ATTENTION != "flashinfer"
+            or SYSTEM != "cuda")
         ):
             raise ValueError(
                 "float8_e5m2 KV cache is currently only supported for flashinfer on CUDA"
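
The change is a De Morgan fix: the old condition raised the error only when the backend was not flashinfer *and* the system was not CUDA, so unsupported combinations such as flashinfer on a non-CUDA system, or a non-flashinfer backend on CUDA, passed the check silently. The corrected condition raises whenever either requirement is unmet, leaving flashinfer-on-CUDA as the only accepted combination for a float8_e5m2 KV cache. Below is a minimal, standalone sketch of the two guard conditions; the backend/system string values are illustrative stand-ins for the module-level `ATTENTION` and `SYSTEM` settings, not the actual `KVCache` code.

```python
# Standalone sketch of the guard logic, independent of torch / TGI internals.
# The string values below are illustrative stand-ins for ATTENTION and SYSTEM.

def old_guard(attention: str, system: str) -> bool:
    # Before the patch: the error fired only when BOTH conditions failed,
    # so e.g. flashinfer on a non-CUDA system slipped through.
    return attention != "flashinfer" and system != "cuda"

def new_guard(attention: str, system: str) -> bool:
    # After the patch: the error fires when EITHER requirement is unmet,
    # i.e. only flashinfer + CUDA is accepted.
    return attention != "flashinfer" or system != "cuda"

if __name__ == "__main__":
    for attention in ("flashinfer", "paged"):
        for system in ("cuda", "rocm"):
            print(
                f"{attention:10s} {system:5s} "
                f"old raises: {old_guard(attention, system)!s:5s} "
                f"new raises: {new_guard(attention, system)}"
            )
```

Running the sketch shows the two guards disagree exactly on the mixed cases (flashinfer on rocm, paged on cuda), which are the configurations the original condition incorrectly allowed.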