diff --git a/server/text_generation_server/layers/attention/kv_cache.py b/server/text_generation_server/layers/attention/kv_cache.py
index 93d74732..67105057 100644
--- a/server/text_generation_server/layers/attention/kv_cache.py
+++ b/server/text_generation_server/layers/attention/kv_cache.py
@@ -52,13 +52,22 @@ class KVCache:
         device: torch.device,
     ):
         """Construct the key-value cache for a layer."""
+        if dtype in {torch.float8_e5m2, torch.float8_e4m3fn}:
+            if not (ATTENTION == "flashinfer" and SYSTEM == "cuda") and not (
+                ATTENTION == "paged" and SYSTEM == "rocm"
+            ):
+                raise ValueError(
+                    "FP8 KV cache is currently only supported for flashinfer on CUDA and paged attention on ROCm"
+                )
+            if SYSTEM == "rocm" and dtype == torch.float8_e5m2:
+                raise ValueError(
+                    "float8_e5m2 FP8 KV cache is not supported on AMD ROCm"
+                )
-        if dtype in {torch.float8_e5m2, torch.float8_e4m3fn} and (
-            ATTENTION != "flashinfer" or SYSTEM != "cuda"
-        ):
-            raise ValueError(
-                "FP8 KV cache is currently only supported for flashinfer on CUDA"
-            )
+
+        self.kv_cache_dtype_str = "auto"
+        if SYSTEM == "rocm" and dtype == torch.float8_e4m3fn:
+            self.kv_cache_dtype_str = "fp8"
+            dtype = torch.uint8
 
         element_size = torch.tensor([], dtype=dtype).element_size()
         if SYSTEM == "ipex" and device.type == "xpu":
@@ -120,6 +129,16 @@ class KVCache:
                 "Using FP8 KV cache scales",
             )
             return True
+        elif (
+            self.kv_cache_dtype_str == "fp8"
+            and ATTENTION == "paged"
+            and SYSTEM == "rocm"
+        ):
+            log_once(
+                logger.info,
+                "Using FP8 KV cache scales",
+            )
+            return True
         else:
             # We have scales, but not the correct FP8 cache type, so warn once.
             log_once(
@@ -158,7 +177,7 @@ class KVCache:
         key_cache = self.kv_cache[0]
         value_cache = self.kv_cache[1]
 
-        if self.can_scale(kv_scales):
+        if self.can_scale(kv_scales) and SYSTEM == "cuda":
             if kv_scales.key_scale_cpu != 1.0:
                 key = fp8_quantize(
                     key.float(),
@@ -188,7 +207,16 @@ class KVCache:
             key_cache.view(-1, shape[-2], shape[-1])[slots] = key
             value_cache.view(-1, shape[-2], shape[-1])[slots] = value
         else:
-            paged_reshape_and_cache(key, value, key_cache, value_cache, slots)
+            paged_reshape_and_cache(
+                key,
+                value,
+                key_cache,
+                value_cache,
+                slots,
+                self.kv_cache_dtype_str,
+                kv_scales.key_scale_cpu,
+                kv_scales.value_scale_cpu,
+            )
 
 
 def paged_reshape_and_cache(
@@ -197,7 +225,11 @@ def paged_reshape_and_cache(
     key_cache: torch.Tensor,
     value_cache: torch.Tensor,
     slots: torch.Tensor,
+    kv_cache_dtype: str = "auto",
+    k_scale: float = 1.0,
+    v_scale: float = 1.0,
 ):
+
     if SYSTEM == "cuda":
         try:
             import attention_kernels
@@ -216,7 +248,7 @@ def paged_reshape_and_cache(
                 f"Could not import vllm paged attention. Make sure your installation is correct. Complete error: {e}"
             )
         ops.reshape_and_cache(
-            key, value, key_cache, value_cache, slots, "auto", 1.0, 1.0
+            key, value, key_cache, value_cache, slots, kv_cache_dtype, k_scale, v_scale
         )
     elif SYSTEM == "ipex":
         import intel_extension_for_pytorch as ipex
diff --git a/server/text_generation_server/layers/attention/rocm.py b/server/text_generation_server/layers/attention/rocm.py
index 0cfac25b..bc790f06 100644
--- a/server/text_generation_server/layers/attention/rocm.py
+++ b/server/text_generation_server/layers/attention/rocm.py
@@ -119,9 +119,9 @@ def paged_attention(
             block_size,
             max_s,
             None,
-            "auto",
-            1.0,
-            1.0,
+            kv_cache.kv_cache_dtype_str,
+            kv_scales.key_scale_cpu,
+            kv_scales.value_scale_cpu,
         )
     else:
         # Run PagedAttention V2.
@@ -154,9 +154,9 @@ def paged_attention(
                 block_size,
                 max_s,
                 None,
-                "auto",
-                1.0,
-                1.0,
+                kv_cache.kv_cache_dtype_str,
+                kv_scales.key_scale_cpu,
+                kv_scales.value_scale_cpu,
             )
         else:
             ops.paged_attention_rocm(
@@ -174,9 +174,9 @@ def paged_attention(
                 block_size,
                 max_s,
                 None,
-                "auto",
-                1.0,
-                1.0,
+                kv_cache.kv_cache_dtype_str,
+                kv_scales.key_scale_cpu,
+                kv_scales.value_scale_cpu,
                 None,
                 _PARTITION_SIZE,
             )
diff --git a/server/text_generation_server/models/custom_modeling/flash_llama_modeling.py b/server/text_generation_server/models/custom_modeling/flash_llama_modeling.py
index 10309006..53df59df 100644
--- a/server/text_generation_server/models/custom_modeling/flash_llama_modeling.py
+++ b/server/text_generation_server/models/custom_modeling/flash_llama_modeling.py
@@ -398,10 +398,16 @@ class LlamaMLP(nn.Module):
             return self.down_proj(out, adapter_data)
         else:
             gate_up_states = self.gate_up_proj(hidden_states, adapter_data)
-            gate_up_states = gate_up_states.view(-1, 2, self.intermediate_size)
-            return self.down_proj(
-                self.act(gate_up_states[:, 0]) * gate_up_states[:, 1], adapter_data
+            output_shape = gate_up_states.shape[:-1] + (self.intermediate_size,)
+            out = torch.empty(
+                output_shape, dtype=gate_up_states.dtype, device=gate_up_states.device
             )
+            ops.silu_and_mul(out, gate_up_states)
+            return self.down_proj(out, adapter_data)
+            # gate_up_states = gate_up_states.view(-1, 2, self.intermediate_size)
+            # return self.down_proj(
+            #     self.act(gate_up_states[:, 0]) * gate_up_states[:, 1], adapter_data
+            # )
 
 
 class FlashLlamaLayer(nn.Module):
diff --git a/server/text_generation_server/models/custom_modeling/flash_mixtral_modeling.py b/server/text_generation_server/models/custom_modeling/flash_mixtral_modeling.py
index a45dd1e6..1bc6c7d4 100644
--- a/server/text_generation_server/models/custom_modeling/flash_mixtral_modeling.py
+++ b/server/text_generation_server/models/custom_modeling/flash_mixtral_modeling.py
@@ -520,28 +520,68 @@ class FlashMixtralForCausalLM(torch.nn.Module):
         lm_head_indices: Optional[torch.Tensor] = None,
         adapter_data: Optional[torch.Tensor] = None,
     ) -> torch.Tensor:
-        true_max_s = max_s
-        if prefill_cache_indices is not None:
-            # Slots also need to be sliced as it has the same size as the whole kv tensor
-            slots = slots[prefill_cache_indices]
-        elif self.max_past is not None:
-            # Clamp in decode mode as paged attention requires clamped values whereas the flash attention
-            # kernel requires the true values
-            seqlen = seqlen.clamp(max=self.max_past_tensor)
-        hidden_states = self.model(
-            input_ids,
-            position_ids,
-            cu_seqlen_prefill,
-            kv_cache,
-            block_tables,
-            slots,
-            seqlen,
-            max_s,
-            true_max_s,
-            prefill_cache_indices,
-        )
-        if lm_head_indices is not None:
-            hidden_states = hidden_states[lm_head_indices]
-        logits = self.lm_head(hidden_states)
+        if (
+            torch.distributed.get_rank() == 0
+            and input_ids.shape[0] == 262144
+            and cu_seqlen_prefill is not None
+        ):
+            with torch.profiler.profile(
+                activities=[
+                    torch.profiler.ProfilerActivity.CPU,
+                    torch.profiler.ProfilerActivity.CUDA,
+                ],
+                record_shapes=True,
+            ) as prof:
+                true_max_s = max_s
+                if prefill_cache_indices is not None:
+                    # Slots also need to be sliced as it has the same size as the whole kv tensor
+                    slots = slots[prefill_cache_indices]
+                elif self.max_past is not None:
+                    # Clamp in decode mode as paged attention requires clamped values whereas the flash attention
+                    # kernel requires the true values
+                    seqlen = seqlen.clamp(max=self.max_past_tensor)
+
+                hidden_states = self.model(
+                    input_ids,
+                    position_ids,
+                    cu_seqlen_prefill,
+                    kv_cache,
+                    block_tables,
+                    slots,
+                    seqlen,
+                    max_s,
+                    true_max_s,
+                    prefill_cache_indices,
+                )
+                if lm_head_indices is not None:
+                    hidden_states = hidden_states[lm_head_indices]
+                logits = self.lm_head(hidden_states)
+
+            prof.export_chrome_trace("/tgi/trace_mistral_prefill.json")
+        else:
+            true_max_s = max_s
+            if prefill_cache_indices is not None:
+                # Slots also need to be sliced as it has the same size as the whole kv tensor
+                slots = slots[prefill_cache_indices]
+            elif self.max_past is not None:
+                # Clamp in decode mode as paged attention requires clamped values whereas the flash attention
+                # kernel requires the true values
+                seqlen = seqlen.clamp(max=self.max_past_tensor)
+
+            hidden_states = self.model(
+                input_ids,
+                position_ids,
+                cu_seqlen_prefill,
+                kv_cache,
+                block_tables,
+                slots,
+                seqlen,
+                max_s,
+                true_max_s,
+                prefill_cache_indices,
+            )
+            if lm_head_indices is not None:
+                hidden_states = hidden_states[lm_head_indices]
+            logits = self.lm_head(hidden_states)
         return logits
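
Below is a minimal, standalone sketch (not part of the patch) of the dtype handling that the patched `KVCache.__init__` performs. The helper name `resolve_kv_cache_dtype` and its `attention`/`system` parameters are placeholders standing in for the module-level `ATTENTION` and `SYSTEM` globals.

```python
import torch


def resolve_kv_cache_dtype(dtype: torch.dtype, attention: str, system: str):
    """Mirror the validation and remapping in the patched KVCache.__init__:
    return the storage dtype and the dtype string handed to the paged kernels."""
    if dtype in {torch.float8_e5m2, torch.float8_e4m3fn}:
        # FP8 KV cache is only accepted for flashinfer/CUDA or paged/ROCm.
        if not (attention == "flashinfer" and system == "cuda") and not (
            attention == "paged" and system == "rocm"
        ):
            raise ValueError(
                "FP8 KV cache is currently only supported for flashinfer on CUDA "
                "and paged attention on ROCm"
            )
        if system == "rocm" and dtype == torch.float8_e5m2:
            raise ValueError("float8_e5m2 FP8 KV cache is not supported on AMD ROCm")

    kv_cache_dtype_str = "auto"
    if system == "rocm" and dtype == torch.float8_e4m3fn:
        # On ROCm the cache is stored as uint8 and the kernels reinterpret it as FP8.
        kv_cache_dtype_str = "fp8"
        dtype = torch.uint8
    return dtype, kv_cache_dtype_str


print(resolve_kv_cache_dtype(torch.float8_e4m3fn, "paged", "rocm"))       # (torch.uint8, 'fp8')
print(resolve_kv_cache_dtype(torch.float8_e4m3fn, "flashinfer", "cuda"))  # (torch.float8_e4m3fn, 'auto')
```

The resulting `kv_cache_dtype_str` and the per-tensor key/value scales are then threaded through `paged_reshape_and_cache` and the ROCm `paged_attention` calls in place of the previous hard-coded `"auto", 1.0, 1.0` arguments.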