diff --git a/launcher/src/main.rs b/launcher/src/main.rs
index 22bdda3b..8e5c9dcd 100644
--- a/launcher/src/main.rs
+++ b/launcher/src/main.rs
@@ -104,6 +104,9 @@ fn resolve_attention(config: &Option<Config>, lora_adapters: &Option<String>) ->
                     tracing::info!("Forcing flash decoding because head dim is not supported by flashinfer, also disabling prefix caching");
                     attention = Some("flashdecoding".to_string());
                 }
+                if prefix_caching.is_none() {
+                    prefix_caching = Some("0".to_string());
+                }
             }
         }
     }
diff --git a/router/src/server.rs b/router/src/server.rs
index f273a786..6b7fff81 100644
--- a/router/src/server.rs
+++ b/router/src/server.rs
@@ -1748,7 +1748,7 @@ pub async fn run(
         let mut tokenizer = Tokenizer::from_file(filename).ok();
         if let Some(tokenizer) = &mut tokenizer {
             if let Some(class) = &tokenizer_config.tokenizer_class {
-                if class == "LlamaTokenizer" || class == "LlamaTokenizerFast"{
+                if class == "LlamaTokenizer" || class == "LlamaTokenizerFast" || class == "CohereTokenizerFast"{
                     if let Ok(post_processor) = create_post_processor(tokenizer, &tokenizer_config) {
                         tracing::info!("Overriding LlamaTokenizer with TemplateProcessing to follow python override defined in https://github.com/huggingface/transformers/blob/4aa17d00690b7f82c95bb2949ea57e22c35b4336/src/transformers/models/llama/tokenization_llama_fast.py#L203-L205");
                         tokenizer.with_post_processor(post_processor);
diff --git a/server/text_generation_server/models/__init__.py b/server/text_generation_server/models/__init__.py
index 4fa9e66d..e03cc30d 100644
--- a/server/text_generation_server/models/__init__.py
+++ b/server/text_generation_server/models/__init__.py
@@ -497,15 +497,14 @@ def get_model(
             else -1
         )
 
-        should_use_sliding_window = (
-            sliding_window is not None and sliding_window != -1 and SUPPORTS_WINDOWING
+        use_sliding_window = sliding_window is not None and sliding_window != -1
+        needs_sliding_window = (
+            max_input_tokens is not None and max_input_tokens > sliding_window
         )
-
-        if should_use_sliding_window:
-            if max_input_tokens is not None and max_input_tokens > sliding_window:
-                raise ValueError(
-                    f"The backend {SYSTEM} does not support sliding window attention that is used by the model type {model_type}. To use this model nonetheless with the {SYSTEM} backend, please launch TGI with the argument `--max-input-tokens` smaller than sliding_window={sliding_window} (got here max_input_tokens={max_input_tokens})."
-                )
+        if use_sliding_window and needs_sliding_window and not SUPPORTS_WINDOWING:
+            raise ValueError(
+                f"The backend {SYSTEM} does not support sliding window attention that is used by the model type {model_type}. To use this model nonetheless with the {SYSTEM} backend, please launch TGI with the argument `--max-input-tokens` smaller than sliding_window={sliding_window} (got here max_input_tokens={max_input_tokens})."
+            )
 
     if model_type == DEEPSEEK_V2:
         if FLASH_ATTENTION:
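
Note on the last hunk: the refactor splits the old combined condition into two flags and only raises when a sliding window is both defined by the model config and actually exceeded by the requested input length on a backend that cannot honor it. Below is a minimal standalone sketch of that guard, not the actual get_model() code path; the values of SYSTEM and SUPPORTS_WINDOWING and the wrapper name check_sliding_window are hypothetical placeholders for illustration.

# Sketch only: SYSTEM and SUPPORTS_WINDOWING stand in for the backend name and
# capability flag that TGI resolves elsewhere; check_sliding_window is a
# hypothetical helper, not part of the repository.
SYSTEM = "example-backend"
SUPPORTS_WINDOWING = False


def check_sliding_window(sliding_window: int, max_input_tokens: int, model_type: str) -> None:
    # sliding_window == -1 means the model config defines no window (see the
    # `else -1` fallback in the hunk's context lines).
    use_sliding_window = sliding_window is not None and sliding_window != -1
    needs_sliding_window = (
        max_input_tokens is not None and max_input_tokens > sliding_window
    )
    # Fail only when the window is defined, the request is long enough to need
    # it, and the backend cannot provide windowed attention.
    if use_sliding_window and needs_sliding_window and not SUPPORTS_WINDOWING:
        raise ValueError(
            f"The backend {SYSTEM} does not support sliding window attention "
            f"used by model type {model_type}; launch with --max-input-tokens "
            f"smaller than sliding_window={sliding_window} "
            f"(got max_input_tokens={max_input_tokens})."
        )


# Example: a 4096-token window with a 2048-token limit passes, because the
# request never exceeds the window even though the backend lacks windowing.
check_sliding_window(sliding_window=4096, max_input_tokens=2048, model_type="mistral")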