diff --git a/load_tests/common.js b/load_tests/common.js
index e0a10595..d890bf67 100644
--- a/load_tests/common.js
+++ b/load_tests/common.js
@@ -33,13 +33,13 @@ export function get_options() {
             //     rate: 20,
             //     timeUnit: '1s',
             // },
-            load_test: {
-                executor: 'constant-arrival-rate',
-                duration: '60s',
-                preAllocatedVUs: 100,
-                rate: 1,
-                timeUnit: '1s',
-            },
+            // load_test: {
+            //     executor: 'constant-arrival-rate',
+            //     duration: '60s',
+            //     preAllocatedVUs: 100,
+            //     rate: 1,
+            //     timeUnit: '1s',
+            // },
             // breakpoint: {
             //     executor: 'ramping-arrival-rate', //Assure load increase if the system slows
             //     preAllocatedVUs: 300,
@@ -47,12 +47,12 @@ export function get_options() {
             //         { duration: '60s', target: 100 }, // just slowly ramp-up to a HUGE load
             //     ],
             // },
-            // throughput: {
-            //     executor: 'shared-iterations',
-            //     vus: 100,
-            //     iterations: 200,
-            //     maxDuration: '40s',
-            // },
+            throughput: {
+                executor: 'shared-iterations',
+                vus: 100,
+                iterations: 200,
+                maxDuration: '40s',
+            },
         },
     };
 }
diff --git a/server/tests/conftest.py b/server/tests/conftest.py
index d7e18d20..d99771f8 100644
--- a/server/tests/conftest.py
+++ b/server/tests/conftest.py
@@ -2,8 +2,8 @@ import pytest
 import os
 from text_generation_server.pb import generate_pb2
 
-os.environ["USE_PREFIX_CACHING"] = "0"
-os.environ["ATTENTION"] = "flashdecoding"
+os.environ["USE_PREFIX_CACHING"] = "1"
+os.environ["ATTENTION"] = "flashinfer"
 
 
 @pytest.fixture
diff --git a/server/text_generation_server/models/flash_causal_lm.py b/server/text_generation_server/models/flash_causal_lm.py
index 4ed3f56d..4ee58038 100644
--- a/server/text_generation_server/models/flash_causal_lm.py
+++ b/server/text_generation_server/models/flash_causal_lm.py
@@ -43,6 +43,7 @@ from text_generation_server.models.globals import (
     ATTENTION,
     BLOCK_SIZE,
     CUDA_GRAPHS,
+    TGI_WIGGLE_ROOM,
     get_adapter_to_index,
 )
 from text_generation_server.layers.attention import Seqlen
@@ -1283,7 +1284,7 @@ class FlashCausalLM(Model):
 
         num_blocks = (
             # Leave 5% for some wiggle room
-            int((free_memory * 0.95) // total_cache_size)
+            int((free_memory * TGI_WIGGLE_ROOM) // total_cache_size)
             # Add batch.num_blocks as we allocated it above, so it is included in the peak memory.
             + batch_num_blocks
         )
diff --git a/server/text_generation_server/models/globals.py b/server/text_generation_server/models/globals.py
index 1f9544a6..6c518c2c 100644
--- a/server/text_generation_server/models/globals.py
+++ b/server/text_generation_server/models/globals.py
@@ -14,10 +14,13 @@ assert (
 ), f"Attention is not valid {ATTENTION}, expected {_expected}"
 log_master(logger.info, f"Using Attention = {ATTENTION}")
 
-# if PREFIX_CACHING and ATTENTION != "flashinfer":
-#     raise RuntimeError("Prefix caching is only supported with flashinfer")
+if PREFIX_CACHING and ATTENTION not in {"flashinfer", "flashdecoding"}:
+    raise RuntimeError("Prefix caching is only supported with flashinfer and flashdecoding")
 
 MEM_POOL = torch.cuda.graph_pool_handle() if torch.cuda.is_available() else None
+TGI_WIGGLE_ROOM = float(os.getenv("TGI_WIGGLE_ROOM", "0.95"))
+assert TGI_WIGGLE_ROOM > 0
+assert TGI_WIGGLE_ROOM < 1
 
 # This is overridden by the cli
 BLOCK_SIZE: int
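
As a quick illustration of the new knob, here is a minimal sketch of how `TGI_WIGGLE_ROOM` scales the KV-cache block budget computed in flash_causal_lm.py; the free-memory, per-block cache size, and warmup block count below are hypothetical placeholders (in the server they come from the device and the model after the warmup pass):

```python
import os

# Same parsing and bounds as the new code in globals.py:
# a fraction strictly between 0 and 1, defaulting to 0.95.
wiggle_room = float(os.getenv("TGI_WIGGLE_ROOM", "0.95"))
assert 0 < wiggle_room < 1

# Hypothetical placeholder values, for illustration only.
free_memory = 40 * 1024**3       # free GPU memory in bytes after warmup
total_cache_size = 2 * 1024**2   # bytes of KV cache needed per block
batch_num_blocks = 128           # blocks already allocated for the warmup batch

# Mirrors the updated formula: keep (1 - wiggle_room) of free memory as
# headroom, then add back the blocks allocated for the warmup batch.
num_blocks = int((free_memory * wiggle_room) // total_cache_size) + batch_num_blocks
print(num_blocks)
```

Launching with, say, `TGI_WIGGLE_ROOM=0.9` leaves 10% headroom instead of the default 5%; values outside (0, 1) fail the asserts at import time.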