diff --git a/backends/v3/src/backend.rs b/backends/v3/src/backend.rs index cbcbff72..0d18fbff 100644 --- a/backends/v3/src/backend.rs +++ b/backends/v3/src/backend.rs @@ -38,7 +38,7 @@ impl BackendV3 { let prefix_caching = if let Ok(prefix_caching) = std::env::var("USE_PREFIX_CACHING") { matches!(prefix_caching.as_str(), "true" | "1") } else { - false + true }; let attention = if let Ok(attention) = std::env::var("ATTENTION") { attention diff --git a/server/text_generation_server/models/globals.py b/server/text_generation_server/models/globals.py index d5133f5e..4132ca73 100644 --- a/server/text_generation_server/models/globals.py +++ b/server/text_generation_server/models/globals.py @@ -5,7 +5,7 @@ from typing import Dict, Optional from text_generation_server.utils.log import log_master -PREFIX_CACHING = os.getenv("USE_PREFIX_CACHING", "0").lower() in {"1", "true"} +PREFIX_CACHING = os.getenv("USE_PREFIX_CACHING", "1").lower() in {"1", "true"} log_master(logger.info, f"Using prefix caching = {PREFIX_CACHING}") ATTENTION = os.getenv("ATTENTION", "flashinfer" if PREFIX_CACHING else "paged") _expected = {"paged", "flashdecoding", "flashinfer"}