Make prefix caching (with the flashinfer attention backend) the default, and run the full release test suite.
This commit is contained in:
parent
21187c27c9
commit
60719babf6
|
@ -38,7 +38,7 @@ impl BackendV3 {
|
||||||
let prefix_caching = if let Ok(prefix_caching) = std::env::var("USE_PREFIX_CACHING") {
|
let prefix_caching = if let Ok(prefix_caching) = std::env::var("USE_PREFIX_CACHING") {
|
||||||
matches!(prefix_caching.as_str(), "true" | "1")
|
matches!(prefix_caching.as_str(), "true" | "1")
|
||||||
} else {
|
} else {
|
||||||
false
|
true
|
||||||
};
|
};
|
||||||
let attention = if let Ok(attention) = std::env::var("ATTENTION") {
|
let attention = if let Ok(attention) = std::env::var("ATTENTION") {
|
||||||
attention
|
attention
|
||||||
|
|
|
@ -5,7 +5,7 @@ from typing import Dict, Optional
|
||||||
|
|
||||||
from text_generation_server.utils.log import log_master
|
from text_generation_server.utils.log import log_master
|
||||||
|
|
||||||
PREFIX_CACHING = os.getenv("USE_PREFIX_CACHING", "0").lower() in {"1", "true"}
|
PREFIX_CACHING = os.getenv("USE_PREFIX_CACHING", "1").lower() in {"1", "true"}
|
||||||
log_master(logger.info, f"Using prefix caching = {PREFIX_CACHING}")
|
log_master(logger.info, f"Using prefix caching = {PREFIX_CACHING}")
|
||||||
ATTENTION = os.getenv("ATTENTION", "flashinfer" if PREFIX_CACHING else "paged")
|
ATTENTION = os.getenv("ATTENTION", "flashinfer" if PREFIX_CACHING else "paged")
|
||||||
_expected = {"paged", "flashdecoding", "flashinfer"}
|
_expected = {"paged", "flashdecoding", "flashinfer"}
|
||||||
|
|
Loading…
Reference in New Issue