Making prefix/flashinfer the default and testing the full release tests.

This commit is contained in:
Nicolas Patry 2024-08-16 14:16:45 +02:00
parent 21187c27c9
commit 60719babf6
No known key found for this signature in database
GPG Key ID: 64AF4752B2967863
2 changed files with 2 additions and 2 deletions

View File

@ -38,7 +38,7 @@ impl BackendV3 {
let prefix_caching = if let Ok(prefix_caching) = std::env::var("USE_PREFIX_CACHING") { let prefix_caching = if let Ok(prefix_caching) = std::env::var("USE_PREFIX_CACHING") {
matches!(prefix_caching.as_str(), "true" | "1") matches!(prefix_caching.as_str(), "true" | "1")
} else { } else {
false true
}; };
let attention = if let Ok(attention) = std::env::var("ATTENTION") { let attention = if let Ok(attention) = std::env::var("ATTENTION") {
attention attention

View File

@ -5,7 +5,7 @@ from typing import Dict, Optional
from text_generation_server.utils.log import log_master from text_generation_server.utils.log import log_master
PREFIX_CACHING = os.getenv("USE_PREFIX_CACHING", "0").lower() in {"1", "true"} PREFIX_CACHING = os.getenv("USE_PREFIX_CACHING", "1").lower() in {"1", "true"}
log_master(logger.info, f"Using prefix caching = {PREFIX_CACHING}") log_master(logger.info, f"Using prefix caching = {PREFIX_CACHING}")
ATTENTION = os.getenv("ATTENTION", "flashinfer" if PREFIX_CACHING else "paged") ATTENTION = os.getenv("ATTENTION", "flashinfer" if PREFIX_CACHING else "paged")
_expected = {"paged", "flashdecoding", "flashinfer"} _expected = {"paged", "flashdecoding", "flashinfer"}